# Q-Q Plot
# install.packages("ggpubr")
library(ggpubr)
## Loading required package: ggplot2
# Post Hoc Test (dunnTest)
# install.packages("FSA")
library(FSA)
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
# Stepwise Regression
# install.packages('MASS')
library(MASS)
# Residual Skewness
# install.packages("moments")
library(moments)
# Read the 2009 survey extract into a data frame and preview its first six rows.
raw.data.09 <- read.csv(
  file = "C:/Users/alber/Desktop/Ryerson/6. Spring-Summer 2020/Datasets Original/pub0909.csv"
)
head(raw.data.09, n = 6L)
## REC_NUM SURVYEAR SURVMNTH LFSSTAT PROV CMA AGE_12 AGE_6 SEX MARSTAT ED76to89
## 1 1 2009 9 6 35 4 10 NA 2 1 NA
## 2 2 2009 9 6 48 4 12 NA 1 6 NA
## 3 3 2009 9 6 35 4 12 NA 1 1 NA
## 4 4 2009 9 1 46 4 4 NA 1 6 NA
## 5 5 2009 9 1 35 4 8 NA 2 1 NA
## 6 6 2009 9 6 35 4 9 NA 2 1 NA
## EDUC90 MJH EVERWORK FTPTLAST COWMAIN FILLER1 FILLER2 NAICS_18 NAICS_43
## 1 2 NA 1 1 2 NA NA 11 34
## 2 1 NA 2 NA NA NA NA NA NA
## 3 2 NA 2 NA NA NA NA NA NA
## 4 4 1 NA NA 2 NA NA 17 40
## 5 4 1 NA NA 2 NA NA 8 27
## 6 1 NA 2 NA NA NA NA NA NA
## SOC80_49 SOC80_21 NOCS_01_25 NOCS_01_47 YABSENT WKSAWAY PAYAWAY UHRSMAIN
## 1 NA NA 6 12 NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA 17 31 NA NA NA 450
## 5 NA NA 5 10 NA NA NA 375
## 6 NA NA NA NA NA NA NA NA
## AHRSMAIN FTPTMAIN UTOTHRS ATOTHRS HRSAWAY YAWAY PAIDOT UNPAIDOT XTRAHRS
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 450 1 450 450 0 NA 0 0 0
## 5 375 1 375 375 0 NA 0 0 0
## 6 NA NA NA NA NA NA NA NA NA
## WHYPTOLD WHYPTNEW TENURE PREVTEN HRLYEARN UNION PERMTEMP ESTSIZE FIRMSIZE
## 1 NA NA NA 240 NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA 105 NA 2564 3 1 2 2
## 5 NA NA 14 NA 1949 3 1 2 2
## 6 NA NA NA NA NA NA NA NA NA
## DURUNEMP FLOWUNEM UNEMFTPT WHYLEFTO WHYLEFTN DURJLESS AVAILABL LKPUBAG
## 1 NA NA NA 5 7 3 NA NA
## 2 NA NA NA NA NA 225 NA NA
## 3 NA NA NA NA NA 88 NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA 36 NA NA
## LKEMPLOY LKRELS LKATADS LKANSADS LKOTHERN PRIORACT YNOLKOLD YNOLOOK TLOLOOK
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## SCHOOLN RELREFN EFAMTYPE EFAMSIZE EFAMEMPL EFAMUNEM SP_AGE7 SP_LFSST SPED7689
## 1 1 2 5 2 1 0 6 2 NA
## 2 NA 1 1 1 0 0 NA NA NA
## 3 NA 1 8 2 1 0 6 2 NA
## 4 1 1 1 1 1 0 NA NA NA
## 5 1 1 2 2 2 0 5 1 NA
## 6 1 1 5 2 1 0 7 1 NA
## SPED1990 SP_SOC80 SP_NOCS01 SP_UHRSM SP_UHRST SP_COWM AGYOWNKN SCH1624
## 1 3 NA 12 2 2 2 NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 2 NA 14 1 1 2 NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 1 NA 5 4 4 2 NA NA
## 6 1 NA 21 5 5 2 NA NA
## FINALWT
## 1 181
## 2 424
## 3 676
## 4 90
## 5 415
## 6 160
nrow(raw.data.09)
## [1] 107593
summary(raw.data.09$LFSSTAT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.925 6.000 6.000
sum(is.na(raw.data.09$LFSSTAT))
## [1] 0
str(raw.data.09$LFSSTAT)
## int [1:107593] 6 6 6 1 1 6 6 6 6 1 ...
# Only Employed Data
# LFSSTAT codes below 3 are the two "Employed" categories (see the labels given
# to LFSSTAT where it is converted to a factor later in this script).
# which() drops NA comparisons; a bare logical index would instead insert an
# all-NA row for every record whose value is missing.
data.09 <- raw.data.09[which(raw.data.09$LFSSTAT < 3), ]
# Only Public and Private Sector Employees Data
# COWMAIN is NA for some raw records (visible in the head() preview), so the
# NA-safe form guards against phantom rows if any of them survive the step above.
data.09 <- data.09[which(data.09$COWMAIN < 3), ]
# OCCUPATION Variable to match 2019's NOC_10
data.09$NOC_10 <- data.09$NOCS_01_25
sort(unique(data.09$NOC_10))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
table(data.09$NOC_10)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 255 3332 1210 2720 5892 3380 1617 2306 2465 2678 952 1223 4182 2187 982 820
## 17 18 19 20 21 22 23 24 25
## 5184 638 1233 3186 2307 1402 1581 2309 516
# Collapse the 25-category occupation code (copied into NOC_10 above) into the
# 10 NOC_10 groups with one lookup. The previous chain of in-place assignments
# was only correct because its rules ran in ascending order: reordering them
# would let a later rule re-match a value written by an earlier one.
# Element i of the lookup is the new group for old code i. The values are
# doubles so the column stays numeric, as before.
noc25.to.noc10 <- c(
  1, 1,              # codes 1-2   -> 1
  2, 2, 2,           # codes 3-5   -> 2
  3,                 # code  6     -> 3
  4, 4,              # codes 7-8   -> 4
  5, 5,              # codes 9-10  -> 5
  6,                 # code  11    -> 6
  7, 7, 7, 7, 7, 7,  # codes 12-17 -> 7
  8, 8, 8, 8, 8,     # codes 18-22 -> 8
  9,                 # code  23    -> 9
  10, 10             # codes 24-25 -> 10
)
# match() returns NA for NA or for any code outside 1:25, so unexpected input
# becomes NA instead of lingering as an unmapped 25-scale value.
data.09$NOC_10 <- noc25.to.noc10[match(data.09$NOC_10, 1:25)]
sort(unique(data.09$NOC_10))
## [1] 1 2 3 4 5 6 7 8 9 10
table(data.09$NOC_10)
##
## 1 2 3 4 5 6 7 8 9 10
## 3587 9822 3380 3923 5143 952 14578 8766 1581 2825
# AGE OF YOUNGEST CHILD
sort(unique(data.09$AGYOWNKN))
## [1] 1 2 3 4 5 6
table(data.09$AGYOWNKN)
##
## 1 2 3 4 5 6
## 4257 2693 6057 2863 1930 3640
# Collapse the six AGYOWNKN codes into four groups with one lookup instead of a
# chain of in-place assignments whose result depended on rule order
# (running "3 -> 2" before "2 -> 1" would have merged codes 1, 2 and 3).
# Element i is the new group for old code i: 1-2 -> 1, 3 -> 2, 4-5 -> 3, 6 -> 4.
agyownkn.lookup <- c(1, 1, 2, 3, 3, 4)
# NA stays NA; any code outside 1:6 also becomes NA.
data.09$AGYOWNKN <- agyownkn.lookup[match(data.09$AGYOWNKN, 1:6)]
sort(unique(data.09$AGYOWNKN))
## [1] 1 2 3 4
table(data.09$AGYOWNKN)
##
## 1 2 3 4
## 6950 6057 4793 3640
# CURRENT STUDENT STATUS
sort(unique(data.09$SCHOOLN))
## [1] 1 2 3 4 5 6 7 8 9
table(data.09$SCHOOLN)
##
## 1 2 3 4 5 6 7 8 9
## 48304 1587 110 1363 654 908 328 88 223
# Fold the even codes 2, 4, 6, 8 into 2 and the odd codes 3, 5, 7, 9 into 3;
# code 1 and missing values are left unchanged.
data.09$SCHOOLN <- replace(data.09$SCHOOLN, data.09$SCHOOLN %in% c(2, 4, 6, 8), 2)
data.09$SCHOOLN <- replace(data.09$SCHOOLN, data.09$SCHOOLN %in% c(3, 5, 7, 9), 3)
sort(unique(data.09$SCHOOLN))
## [1] 1 2 3
table(data.09$SCHOOLN)
##
## 1 2 3
## 48304 3946 1315
# Remove Out-of-Date Variables
names(data.09)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV"
## [6] "CMA" "AGE_12" "AGE_6" "SEX" "MARSTAT"
## [11] "ED76to89" "EDUC90" "MJH" "EVERWORK" "FTPTLAST"
## [16] "COWMAIN" "FILLER1" "FILLER2" "NAICS_18" "NAICS_43"
## [21] "SOC80_49" "SOC80_21" "NOCS_01_25" "NOCS_01_47" "YABSENT"
## [26] "WKSAWAY" "PAYAWAY" "UHRSMAIN" "AHRSMAIN" "FTPTMAIN"
## [31] "UTOTHRS" "ATOTHRS" "HRSAWAY" "YAWAY" "PAIDOT"
## [36] "UNPAIDOT" "XTRAHRS" "WHYPTOLD" "WHYPTNEW" "TENURE"
## [41] "PREVTEN" "HRLYEARN" "UNION" "PERMTEMP" "ESTSIZE"
## [46] "FIRMSIZE" "DURUNEMP" "FLOWUNEM" "UNEMFTPT" "WHYLEFTO"
## [51] "WHYLEFTN" "DURJLESS" "AVAILABL" "LKPUBAG" "LKEMPLOY"
## [56] "LKRELS" "LKATADS" "LKANSADS" "LKOTHERN" "PRIORACT"
## [61] "YNOLKOLD" "YNOLOOK" "TLOLOOK" "SCHOOLN" "RELREFN"
## [66] "EFAMTYPE" "EFAMSIZE" "EFAMEMPL" "EFAMUNEM" "SP_AGE7"
## [71] "SP_LFSST" "SPED7689" "SPED1990" "SP_SOC80" "SP_NOCS01"
## [76] "SP_UHRSM" "SP_UHRST" "SP_COWM" "AGYOWNKN" "SCH1624"
## [81] "FINALWT" "NOC_10"
# Drop the columns named below by position and keep every other column in its
# current order. match() yields NA for a name that is absent, which makes the
# subscript fail rather than silently keeping an unexpected layout.
data.09 <- data.09[, -match(
  c(
    "ED76to89", "NAICS_43", "SOC80_49", "SOC80_21",
    "NOCS_01_25", "NOCS_01_47", "WHYPTOLD", "YNOLKOLD",
    "RELREFN", "EFAMSIZE", "EFAMEMPL", "EFAMUNEM",
    "SP_AGE7", "SP_LFSST", "SPED7689", "SPED1990",
    "SP_SOC80", "SP_NOCS01", "SP_UHRSM", "SP_UHRST",
    "SP_COWM", "SCH1624"
  ),
  names(data.09)
), drop = FALSE]
names(data.09)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV" "CMA"
## [7] "AGE_12" "AGE_6" "SEX" "MARSTAT" "EDUC90" "MJH"
## [13] "EVERWORK" "FTPTLAST" "COWMAIN" "FILLER1" "FILLER2" "NAICS_18"
## [19] "YABSENT" "WKSAWAY" "PAYAWAY" "UHRSMAIN" "AHRSMAIN" "FTPTMAIN"
## [25] "UTOTHRS" "ATOTHRS" "HRSAWAY" "YAWAY" "PAIDOT" "UNPAIDOT"
## [31] "XTRAHRS" "WHYPTNEW" "TENURE" "PREVTEN" "HRLYEARN" "UNION"
## [37] "PERMTEMP" "ESTSIZE" "FIRMSIZE" "DURUNEMP" "FLOWUNEM" "UNEMFTPT"
## [43] "WHYLEFTO" "WHYLEFTN" "DURJLESS" "AVAILABL" "LKPUBAG" "LKEMPLOY"
## [49] "LKRELS" "LKATADS" "LKANSADS" "LKOTHERN" "PRIORACT" "YNOLOOK"
## [55] "TLOLOOK" "SCHOOLN" "EFAMTYPE" "AGYOWNKN" "FINALWT" "NOC_10"
# Remove Unemployment and other Variables specified on main report
# Columns are dropped by position; all remaining columns keep their order.
data.09 <- data.09[, -match(
  c(
    "AGE_6", "EVERWORK", "FTPTLAST", "YABSENT", "WKSAWAY", "PAYAWAY",
    "UHRSMAIN", "AHRSMAIN", "ATOTHRS", "HRSAWAY", "YAWAY", "PAIDOT",
    "UNPAIDOT", "XTRAHRS", "WHYPTNEW", "PREVTEN", "DURUNEMP", "FLOWUNEM",
    "UNEMFTPT", "WHYLEFTO", "WHYLEFTN", "DURJLESS", "AVAILABL", "LKPUBAG",
    "LKEMPLOY", "LKRELS", "LKATADS", "LKANSADS", "LKOTHERN", "PRIORACT",
    "YNOLOOK", "TLOLOOK", "FINALWT"
  ),
  names(data.09)
), drop = FALSE]
# Rename Variables
# Map each 2009 column name to the name used in the 2019 file. WHYPTNEW is not
# listed: it was already removed in the drop step above, so its rename never
# matched anything.
rename.map <- c(
  EDUC90   = "EDUC",
  FILLER1  = "IMMIG",
  FILLER2  = "NOC_40",
  AGYOWNKN = "AGYOWNK"
)
rename.hit <- match(names(data.09), names(rename.map))
names(data.09)[!is.na(rename.hit)] <- unname(rename.map[rename.hit[!is.na(rename.hit)]])

# Move variables NOC_10, NAICS_18
# Place NAICS_18 and NOC_10 directly after IMMIG. Columns are located by exact
# name (grep() would also hit any name merely containing the pattern) and the
# order is derived from those positions rather than hard-coded column numbers.
col.names <- names(data.09)
noc10.idx <- match("NOC_10", col.names)
naics18.idx <- match("NAICS_18", col.names)
immig.idx <- match("IMMIG", col.names)
# Fail early with a clear message if an expected column is missing.
stopifnot(!anyNA(c(noc10.idx, naics18.idx, immig.idx)))
other.idx <- setdiff(seq_along(col.names), c(naics18.idx, noc10.idx))
data.09 <- data.09[, c(other.idx[other.idx <= immig.idx],
                       naics18.idx,
                       noc10.idx,
                       other.idx[other.idx > immig.idx])]
names(data.09)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV" "CMA"
## [7] "AGE_12" "SEX" "MARSTAT" "EDUC" "MJH" "COWMAIN"
## [13] "IMMIG" "NAICS_18" "NOC_10" "NOC_40" "FTPTMAIN" "UTOTHRS"
## [19] "TENURE" "HRLYEARN" "UNION" "PERMTEMP" "ESTSIZE" "FIRMSIZE"
## [25] "SCHOOLN" "EFAMTYPE" "AGYOWNK"
# Read the 2019 survey extract into a data frame and preview its first six rows.
raw.data.19 <- read.csv(
  file = "C:/Users/alber/Desktop/Ryerson/6. Spring-Summer 2020/Datasets Original/pub0919.csv"
)
head(raw.data.19, n = 6L)
## REC_NUM SURVYEAR SURVMNTH LFSSTAT PROV CMA AGE_12 AGE_6 SEX MARSTAT EDUC MJH
## 1 1 2019 9 2 35 0 7 NA 2 2 4 1
## 2 2 2019 9 1 59 0 4 NA 1 6 4 1
## 3 3 2019 9 1 59 9 3 6 2 6 6 1
## 4 4 2019 9 4 35 0 9 NA 2 1 2 NA
## 5 5 2019 9 1 24 0 6 NA 1 2 4 1
## 6 6 2019 9 1 35 3 8 NA 2 1 4 1
## EVERWORK FTPTLAST COWMAIN IMMIG NAICS_21 NOC_10 NOC_40 YABSENT WKSAWAY
## 1 NA NA 2 3 19 7 26 3 2
## 2 NA NA 2 3 20 2 5 NA NA
## 3 NA NA 2 3 19 7 24 NA NA
## 4 1 2 NA 3 NA NA NA NA NA
## 5 NA NA 5 3 1 1 4 NA NA
## 6 NA NA 1 3 5 7 26 NA NA
## PAYAWAY UHRSMAIN AHRSMAIN FTPTMAIN UTOTHRS ATOTHRS HRSAWAY YAWAY PAIDOT
## 1 2 250 0 2 250 0 NA NA NA
## 2 NA 400 240 1 400 240 160 3 0
## 3 NA 400 400 1 400 400 0 NA 0
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA 700 700 1 700 700 NA NA NA
## 6 NA 363 363 1 363 363 0 NA 0
## UNPAIDOT XTRAHRS WHYPT TENURE PREVTEN HRLYEARN UNION PERMTEMP ESTSIZE
## 1 NA NA 7 30 NA 2500 3 1 2
## 2 0 0 NA 35 NA 6346 3 1 1
## 3 0 0 NA 53 NA 2450 3 1 1
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA 72 NA NA NA NA NA
## 6 0 0 NA 240 NA 4000 1 1 4
## FIRMSIZE DURUNEMP FLOWUNEM UNEMFTPT WHYLEFTO WHYLEFTN DURJLESS AVAILABL
## 1 4 NA NA NA NA NA NA NA
## 2 4 NA NA NA NA NA NA NA
## 3 1 NA NA NA NA NA NA NA
## 4 NA NA NA NA 1 1 11 NA
## 5 NA NA NA NA NA NA NA NA
## 6 4 NA NA NA NA NA NA NA
## LKPUBAG LKEMPLOY LKRELS LKATADS LKANSADS LKOTHERN PRIORACT YNOLOOK TLOLOOK
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## SCHOOLN EFAMTYPE AGYOWNK FINALWT
## 1 1 2 NA 217
## 2 1 1 NA 302
## 3 1 1 NA 195
## 4 1 11 NA 231
## 5 1 3 2 79
## 6 1 3 3 640
nrow(raw.data.19)
## [1] 100011
summary(raw.data.19$LFSSTAT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.208 4.000 4.000
sum(is.na(raw.data.19$LFSSTAT))
## [1] 0
str(raw.data.19$LFSSTAT)
## int [1:100011] 2 1 1 4 1 1 4 1 1 4 ...
# Only Employed
# LFSSTAT codes below 3 are the two "Employed" categories (see the labels given
# to LFSSTAT where it is converted to a factor later in this script).
# which() drops NA comparisons; a bare logical index would instead insert an
# all-NA row for every record whose value is missing.
data.19 <- raw.data.19[which(raw.data.19$LFSSTAT < 3), ]
# Only Public and Private Sector Employees
# COWMAIN is NA for some raw records (visible in the head() preview), so the
# NA-safe form guards against phantom rows if any of them survive the step above.
data.19 <- data.19[which(data.19$COWMAIN < 3), ]
# CMA
sort(unique(data.19$CMA))
## [1] 0 1 2 3 4 5 6 7 8 9
table(data.19$CMA)
##
## 0 1 2 3 4 5 6 7 8 9
## 36367 824 2044 848 2974 757 2589 1356 1248 1924
# Recode the ten 2019 CMA codes (0-9) to the four-group scheme in one lookup.
# The previous in-place chain needed temporary sentinel values (10 and 30) to
# stop new codes colliding with old codes not yet processed.
# Old code 2 -> 1, 4 -> 2, 9 -> 3, every other code (0, 1, 3, 5, 6, 7, 8) -> 4.
# These four groups are labelled Montreal, Toronto, Vancouver and Other where
# CMA is converted to a factor later in this script.
# Element k of the lookup is the new code for old code k - 1. The values are
# doubles so the column stays numeric, as before.
cma.lookup <- c(4, 4, 1, 4, 2, 4, 4, 4, 4, 3)
# NA stays NA; any code outside 0:9 also becomes NA.
data.19$CMA <- cma.lookup[match(data.19$CMA, 0:9)]
sort(unique(data.19$CMA))
## [1] 1 2 3 4
table(data.19$CMA)
##
## 1 2 3 4
## 2044 2974 1924 43989
# NAICS_21 TO NAICS_18 (2009)
sort(unique(data.19$NAICS_21))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
table(data.19$NAICS_21)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 709 202 59 1220 495 3746 2736 2308 1590 6125 2559 1863 643 2494 1621 4465
## 17 18 19 20 21
## 7432 1858 3540 1716 3550
# Collapse the 21 NAICS_21 codes to the 18-category scheme used by the 2009
# data with one lookup. The previous chain of 17 in-place assignments was only
# correct because it ran in ascending order; any reordering would let a later
# rule re-match a value written by an earlier one.
# Element i is the new code for old code i. The values are doubles so the
# column stays numeric, as before.
naics21.to.naics18 <- c(
  1,                              # code  1     -> 1
  2, 2, 2,                        # codes 2-4   -> 2
  3, 4, 5, 6, 7, 8, 9,            # codes 5-11  -> 3-9
  10, 10,                         # codes 12-13 -> 10
  11, 12, 13, 14, 15, 16, 17, 18  # codes 14-21 -> 11-18
)
# NA stays NA; any code outside 1:21 also becomes NA.
data.19$NAICS_21 <- naics21.to.naics18[match(data.19$NAICS_21, 1:21)]
# The recoded column now follows the 18-category scheme, so rename it to match.
names(data.19)[names(data.19) == "NAICS_21"] <- "NAICS_18"
sort(unique(data.19$NAICS_18))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
table(data.19$NAICS_18)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 709 1481 495 3746 2736 2308 1590 6125 2559 2506 2494 1621 4465 7432 1858 3540
## 17 18
## 1716 3550
# Remove Unemployment and other Variables specified on main report
names(data.19)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV" "CMA"
## [7] "AGE_12" "AGE_6" "SEX" "MARSTAT" "EDUC" "MJH"
## [13] "EVERWORK" "FTPTLAST" "COWMAIN" "IMMIG" "NAICS_18" "NOC_10"
## [19] "NOC_40" "YABSENT" "WKSAWAY" "PAYAWAY" "UHRSMAIN" "AHRSMAIN"
## [25] "FTPTMAIN" "UTOTHRS" "ATOTHRS" "HRSAWAY" "YAWAY" "PAIDOT"
## [31] "UNPAIDOT" "XTRAHRS" "WHYPT" "TENURE" "PREVTEN" "HRLYEARN"
## [37] "UNION" "PERMTEMP" "ESTSIZE" "FIRMSIZE" "DURUNEMP" "FLOWUNEM"
## [43] "UNEMFTPT" "WHYLEFTO" "WHYLEFTN" "DURJLESS" "AVAILABL" "LKPUBAG"
## [49] "LKEMPLOY" "LKRELS" "LKATADS" "LKANSADS" "LKOTHERN" "PRIORACT"
## [55] "YNOLOOK" "TLOLOOK" "SCHOOLN" "EFAMTYPE" "AGYOWNK" "FINALWT"
# Drop the columns named below by position and keep every other column in its
# current order. match() yields NA for a name that is absent, which makes the
# subscript fail rather than silently keeping an unexpected layout.
data.19 <- data.19[, -match(
  c(
    "AGE_6", "EVERWORK", "FTPTLAST", "YABSENT", "WKSAWAY", "PAYAWAY",
    "UHRSMAIN", "AHRSMAIN", "ATOTHRS", "HRSAWAY", "YAWAY", "PAIDOT",
    "UNPAIDOT", "XTRAHRS", "WHYPT", "PREVTEN", "DURUNEMP", "FLOWUNEM",
    "UNEMFTPT", "WHYLEFTO", "WHYLEFTN", "DURJLESS", "AVAILABL", "LKPUBAG",
    "LKEMPLOY", "LKRELS", "LKATADS", "LKANSADS", "LKOTHERN", "PRIORACT",
    "YNOLOOK", "TLOLOOK", "FINALWT"
  ),
  names(data.19)
), drop = FALSE]
# Check both data sets have the same variables
names(data.19)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV" "CMA"
## [7] "AGE_12" "SEX" "MARSTAT" "EDUC" "MJH" "COWMAIN"
## [13] "IMMIG" "NAICS_18" "NOC_10" "NOC_40" "FTPTMAIN" "UTOTHRS"
## [19] "TENURE" "HRLYEARN" "UNION" "PERMTEMP" "ESTSIZE" "FIRMSIZE"
## [25] "SCHOOLN" "EFAMTYPE" "AGYOWNK"
names(data.09)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT" "PROV" "CMA"
## [7] "AGE_12" "SEX" "MARSTAT" "EDUC" "MJH" "COWMAIN"
## [13] "IMMIG" "NAICS_18" "NOC_10" "NOC_40" "FTPTMAIN" "UTOTHRS"
## [19] "TENURE" "HRLYEARN" "UNION" "PERMTEMP" "ESTSIZE" "FIRMSIZE"
## [25] "SCHOOLN" "EFAMTYPE" "AGYOWNK"
# Stack the 2009 rows on top of the 2019 rows; both are plain data frames, so
# the data-frame method of rbind() is what combines them.
data.all <- rbind.data.frame(data.09, data.19)
str(data.all)
## 'data.frame': 105488 obs. of 27 variables:
## $ REC_NUM : int 4 5 10 11 12 15 16 18 19 21 ...
## $ SURVYEAR: int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ SURVMNTH: int 9 9 9 9 9 9 9 9 9 9 ...
## $ LFSSTAT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PROV : int 46 35 35 24 12 48 24 12 48 59 ...
## $ CMA : num 4 4 4 4 4 4 4 4 4 3 ...
## $ AGE_12 : int 4 8 6 4 3 9 8 2 2 10 ...
## $ SEX : int 1 2 1 2 1 1 1 1 2 2 ...
## $ MARSTAT : int 6 1 1 1 2 1 2 6 1 6 ...
## $ EDUC : int 4 4 2 4 5 1 4 2 4 1 ...
## $ MJH : int 1 1 1 2 1 1 1 1 1 1 ...
## $ COWMAIN : int 2 2 2 2 1 2 2 2 2 2 ...
## $ IMMIG : int NA NA NA NA NA NA NA NA NA NA ...
## $ NAICS_18: num 17 8 5 14 13 6 4 12 14 14 ...
## $ NOC_10 : num 7 2 10 5 5 8 8 2 4 7 ...
## $ NOC_40 : int NA NA NA NA NA NA NA NA NA NA ...
## $ FTPTMAIN: int 1 1 1 2 2 1 1 1 1 1 ...
## $ UTOTHRS : int 450 375 400 496 240 400 370 375 400 450 ...
## $ TENURE : int 105 14 115 36 7 237 39 11 29 39 ...
## $ HRLYEARN: int 2564 1949 3750 1690 667 2550 2600 1380 2100 1700 ...
## $ UNION : int 3 3 1 3 1 1 1 3 3 1 ...
## $ PERMTEMP: int 1 1 1 1 4 1 1 1 1 1 ...
## $ ESTSIZE : int 2 2 4 1 4 4 4 3 1 1 ...
## $ FIRMSIZE: int 2 2 4 1 4 4 4 4 1 1 ...
## $ SCHOOLN : num 1 1 1 1 1 1 1 1 1 1 ...
## $ EFAMTYPE: int 1 2 3 2 2 5 5 1 2 1 ...
## $ AGYOWNK : num NA NA 2 NA NA NA NA NA NA NA ...
summary(data.all)
## REC_NUM SURVYEAR SURVMNTH LFSSTAT PROV
## Min. : 1 Min. :2009 Min. :9 Min. :1.000 Min. :10.00
## 1st Qu.: 25978 1st Qu.:2009 1st Qu.:9 1st Qu.:1.000 1st Qu.:24.00
## Median : 51785 Median :2009 Median :9 Median :1.000 Median :35.00
## Mean : 51925 Mean :2014 Mean :9 Mean :1.071 Mean :35.28
## 3rd Qu.: 77717 3rd Qu.:2019 3rd Qu.:9 3rd Qu.:1.000 3rd Qu.:47.00
## Max. :107593 Max. :2019 Max. :9 Max. :2.000 Max. :59.00
##
## CMA AGE_12 SEX MARSTAT
## Min. :1.000 Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.:1.000
## Median :4.000 Median : 6.000 Median :2.000 Median :2.000
## Mean :3.726 Mean : 5.765 Mean :1.503 Mean :2.877
## 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2.000 3rd Qu.:6.000
## Max. :4.000 Max. :12.000 Max. :2.000 Max. :6.000
##
## EDUC MJH COWMAIN IMMIG
## Min. :0.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.00
## Median :4.000 Median :1.000 Median :2.000 Median :3.00
## Mean :3.474 Mean :1.055 Mean :1.733 Mean :2.77
## 3rd Qu.:4.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:3.00
## Max. :6.000 Max. :2.000 Max. :2.000 Max. :3.00
## NA's :54557
## NAICS_18 NOC_10 NOC_40 FTPTMAIN
## Min. : 1.00 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.: 7.00 1st Qu.: 3.000 1st Qu.:10.00 1st Qu.:1.000
## Median :11.00 Median : 7.000 Median :22.00 Median :1.000
## Mean :10.63 Mean : 5.425 Mean :19.91 Mean :1.183
## 3rd Qu.:14.00 3rd Qu.: 7.000 3rd Qu.:28.00 3rd Qu.:1.000
## Max. :18.00 Max. :10.000 Max. :40.00 Max. :2.000
## NA's :54557
## UTOTHRS TENURE HRLYEARN UNION
## Min. : 4.0 Min. : 1.00 Min. : 200 Min. :1.000
## 1st Qu.:350.0 1st Qu.: 15.00 1st Qu.: 1500 1st Qu.:1.000
## Median :400.0 Median : 53.00 Median : 2100 Median :3.000
## Mean :362.3 Mean : 86.02 Mean : 2437 Mean :2.353
## 3rd Qu.:400.0 3rd Qu.:143.00 3rd Qu.: 3077 3rd Qu.:3.000
## Max. :990.0 Max. :240.00 Max. :11538 Max. :3.000
##
## PERMTEMP ESTSIZE FIRMSIZE SCHOOLN
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :1.000 Median :2.000 Median :3.000 Median :1.000
## Mean :1.275 Mean :2.094 Mean :2.935 Mean :1.117
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1.000
## Max. :4.000 Max. :4.000 Max. :4.000 Max. :3.000
## NA's :3002
## EFAMTYPE AGYOWNK
## Min. : 1.000 Min. :1.00
## 1st Qu.: 2.000 1st Qu.:1.00
## Median : 3.000 Median :2.00
## Mean : 4.899 Mean :2.21
## 3rd Qu.: 5.000 3rd Qu.:3.00
## Max. :18.000 Max. :4.00
## NA's :65144
# SCHOOLN
# Give missing values their own code (4) so they form a separate category.
data.all$SCHOOLN <- replace(data.all$SCHOOLN, is.na(data.all$SCHOOLN), 4)
summary(data.all$SCHOOLN)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.199 1.000 4.000
# AGYOWNK
# Give missing values their own code (5) so they form a separate category.
data.all$AGYOWNK <- replace(data.all$AGYOWNK, is.na(data.all$AGYOWNK), 5)
summary(data.all$AGYOWNK)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 5.000 3.933 5.000 5.000
# Labor Force Status
unique(data.all$LFSSTAT)
## [1] 1 2
str(data.all$LFSSTAT)
## int [1:105488] 1 1 1 1 1 1 1 1 1 1 ...
# Convert the two remaining LFSSTAT codes into a labelled factor.
data.all[["LFSSTAT"]] <- factor(
  data.all[["LFSSTAT"]],
  levels = 1:2,
  labels = c("Employed, at work", "Employed, absent from work")
)
str(data.all$LFSSTAT)
## Factor w/ 2 levels "Employed, at work",..: 1 1 1 1 1 1 1 1 1 1 ...
# Province
# Convert the numeric PROV codes into a factor labelled with province abbreviations.
data.all[["PROV"]] <- factor(
  data.all[["PROV"]],
  levels = c(10, 11, 12, 13, 24, 35, 46, 47, 48, 59),
  labels = c("NL", "PEI", "NS", "NB", "QC", "ON", "MB", "SK", "AB", "BC")
)
str(data.all$PROV)
## Factor w/ 10 levels "NL","PEI","NS",..: 7 6 6 5 3 9 5 3 9 10 ...
# CMA
sort(unique(data.all$CMA))
## [1] 1 2 3 4
str(data.all$CMA)
## num [1:105488] 4 4 4 4 4 4 4 4 4 3 ...
# Convert the four CMA codes into a labelled factor.
data.all[["CMA"]] <- factor(
  data.all[["CMA"]],
  levels = 1:4,
  labels = c("Montreal", "Toronto", "Vancouver", "Other")
)
str(data.all$CMA)
## Factor w/ 4 levels "Montreal","Toronto",..: 4 4 4 4 4 4 4 4 4 3 ...
# Age 12 groups
sort(unique(data.all$AGE_12))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
str(data.all$AGE_12)
## int [1:105488] 4 8 6 4 3 9 8 2 2 10 ...
# Convert the twelve AGE_12 codes into an ordered factor of age bands.
data.all[["AGE_12"]] <- factor(
  data.all[["AGE_12"]],
  levels = seq_len(12),
  labels = c(
    "15-19", "20-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-64", "65-69", "70-over"
  ),
  ordered = TRUE
)
str(data.all$AGE_12)
## Ord.factor w/ 12 levels "15-19"<"20-24"<..: 4 8 6 4 3 9 8 2 2 10 ...
# Sex
sort(unique(data.all$SEX))
## [1] 1 2
str(data.all$SEX)
## int [1:105488] 1 2 1 2 1 1 1 1 2 2 ...
# Convert the two SEX codes into a labelled factor.
data.all[["SEX"]] <- factor(
  data.all[["SEX"]],
  levels = 1:2,
  labels = c("Male", "Female")
)
str(data.all$SEX)
## Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 2 2 ...
# Marital Status
sort(unique(data.all$MARSTAT))
## [1] 1 2 3 4 5 6
str(data.all$MARSTAT)
## int [1:105488] 6 1 1 1 2 1 2 6 1 6 ...
# Convert the six MARSTAT codes into a labelled factor (two labels abbreviated).
data.all[["MARSTAT"]] <- factor(
  data.all[["MARSTAT"]],
  levels = seq_len(6),
  labels = c(
    "Married",
    "Common-law",  # Living in common-law
    "Widowed",
    "Separated",
    "Divorced",
    "Single, NM"   # Single, never married
  )
)
str(data.all$MARSTAT)
## Factor w/ 6 levels "Married","Common-law",..: 6 1 1 1 2 1 2 6 1 6 ...
# Education Attainment
sort(unique(data.all$EDUC))
## [1] 0 1 2 3 4 5 6
str(data.all$EDUC)
## int [1:105488] 4 4 2 4 5 1 4 2 4 1 ...
# Convert the seven EDUC codes (0-6) into an ordered factor of attainment levels.
data.all[["EDUC"]] <- factor(
  data.all[["EDUC"]],
  levels = 0:6,
  labels = c(
    "0 to 8 years",
    "Some high school",
    "High school graduate",
    "Some postsecondary",
    "Postsecondary certificate or diploma",
    "Bachelor's degree",
    "Above bachelor's degree"
  ),
  ordered = TRUE
)
str(data.all$EDUC)
## Ord.factor w/ 7 levels "0 to 8 years"<..: 5 5 3 5 6 2 5 3 5 2 ...
# Add Education column with SHORT NAMES
# Copy EDUC into EDUCshort and relabel its levels with short names.
# Each list entry reads: new short level = existing long level.
data.all[["EDUCshort"]] <- `levels<-`(
  data.all[["EDUC"]],
  value = list(
    No.HS     = "0 to 8 years",
    Some.HS   = "Some high school",
    HS.grad   = "High school graduate",
    Some.Post = "Some postsecondary",
    Post.cert = "Postsecondary certificate or diploma",
    Bachelor  = "Bachelor's degree",
    Above.B   = "Above bachelor's degree"
  )
)
str(data.all$EDUCshort)
## Ord.factor w/ 7 levels "No.HS"<"Some.HS"<..: 5 5 3 5 6 2 5 3 5 2 ...
# Single or Multiple Jobholder
sort(unique(data.all$MJH))
## [1] 1 2
str(data.all$MJH)
## int [1:105488] 1 1 1 2 1 1 1 1 1 1 ...
# Convert the two MJH codes into a labelled factor.
data.all[["MJH"]] <- factor(
  data.all[["MJH"]],
  levels = 1:2,
  labels = c("Single jobholder", "Multiple jobholder")
)
str(data.all$MJH)
## Factor w/ 2 levels "Single jobholder",..: 1 1 1 2 1 1 1 1 1 1 ...
# Class of Worker, Main Job
sort(unique(data.all$COWMAIN))
## [1] 1 2
str(data.all$COWMAIN)
## int [1:105488] 2 2 2 2 1 2 2 2 2 2 ...
# Convert the two COWMAIN codes kept by the earlier filter into a labelled factor.
data.all[["COWMAIN"]] <- factor(
  data.all[["COWMAIN"]],
  levels = 1:2,
  labels = c("Public sector", "Private sector")
)
str(data.all$COWMAIN)
## Factor w/ 2 levels "Public sector",..: 2 2 2 2 1 2 2 2 2 2 ...
# Immigrant Status
sort(unique(data.all$IMMIG))
## [1] 1 2 3
str(data.all$IMMIG)
## int [1:105488] NA NA NA NA NA NA NA NA NA NA ...
# Convert the three IMMIG codes into a labelled factor; missing values stay NA.
data.all[["IMMIG"]] <- factor(
  data.all[["IMMIG"]],
  levels = 1:3,
  labels = c(
    "Immigrant, landed =< 10 years",
    "Immigrant, landed > 10 years",
    "Non-immigrant"
  )
)
str(data.all$IMMIG)
## Factor w/ 3 levels "Immigrant, landed =< 10 years",..: NA NA NA NA NA NA NA NA NA NA ...
summary(data.all$IMMIG)
## Immigrant, landed =< 10 years Immigrant, landed > 10 years
## 3236 5232
## Non-immigrant NA's
## 42463 54557
# Industry of main job
sort(unique(data.all$NAICS_18))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
str(data.all$NAICS_18)
## num [1:105488] 17 8 5 14 13 6 4 12 14 14 ...
# Convert the eighteen NAICS_18 codes into a factor of industry labels.
data.all[["NAICS_18"]] <- factor(
  data.all[["NAICS_18"]],
  levels = seq_len(18),
  labels = c(
    "Agriculture",
    "Forestry, Fishing, Min., Oil & Gas",
    "Utilities",
    "Construction",
    "Manufacturing durables",
    "Manufacturing non-durables",
    "Wholesale Trade",
    "Retail Trade",
    "Transportation & Warehousing",
    "Finance, Insurance, Real Est. & Leas.",
    "Prof., Scientific & Technical Services",
    "Management, Admin. & Support",
    "Educational Services",
    "Health Care & Social Assistance",
    "Information, Culture & Recreation",
    "Accommodation & Food Services",
    "Other Services",
    "Public Administration"
  )
)
str(data.all$NAICS_18)
## Factor w/ 18 levels "Agriculture",..: 17 8 5 14 13 6 4 12 14 14 ...
# Add Industry column with SHORT NAMES
# Copy NAICS_18 into NAICS_18short and relabel its levels with short names.
# Each list entry reads: new short level = existing long level.
data.all[["NAICS_18short"]] <- `levels<-`(
  data.all[["NAICS_18"]],
  value = list(
    Agri   = "Agriculture",
    Fores  = "Forestry, Fishing, Min., Oil & Gas",      # "Forestry, Fishing, Mining, Oil and Gas"
    Utils  = "Utilities",
    Const  = "Construction",
    ManuD  = "Manufacturing durables",
    ManuN  = "Manufacturing non-durables",
    Whole  = "Wholesale Trade",
    Rtail  = "Retail Trade",
    Trans  = "Transportation & Warehousing",
    Finan  = "Finance, Insurance, Real Est. & Leas.",   # "Finance, Insurance, Real Estate and Leasing"
    ProSc  = "Prof., Scientific & Technical Services",  # "Professional, Scientific and Technical Services"
    Mngt   = "Management, Admin. & Support",            # "Management, Administrative and Other Support"
    Educa  = "Educational Services",
    Health = "Health Care & Social Assistance",
    Info   = "Information, Culture & Recreation",
    AcFood = "Accommodation & Food Services",
    Other  = "Other Services",
    PubAd  = "Public Administration"
  )
)
str(data.all$NAICS_18short)
## Factor w/ 18 levels "Agri","Fores",..: 17 8 5 14 13 6 4 12 14 14 ...
# Occupation at main job (10 categories)
sort(unique(data.all$NOC_10))
## [1] 1 2 3 4 5 6 7 8 9 10
str(data.all$NOC_10)
## num [1:105488] 7 2 10 5 5 8 8 2 4 7 ...
# Convert the ten NOC_10 codes into a factor of occupation labels.
data.all[["NOC_10"]] <- factor(
  data.all[["NOC_10"]],
  levels = seq_len(10),
  labels = c(
    "Management",
    "Business, finance & administration",
    "Natural & applied sciences",
    "Health",
    "Educ., law, community & gov. services",  # "Education, law and social, community and government services"
    "Art, culture, recreation & sport",
    "Sales & service",
    "Trades, transport & equipm. operators",  # "Trades, transport and equipment operators"
    "Natural resources & agriculture",
    "Manufacturing & utilities"
  )
)
str(data.all$NOC_10)
## Factor w/ 10 levels "Management","Business, finance & administration",..: 7 2 10 5 5 8 8 2 4 7 ...
summary(data.all$NOC_10)
## Management Business, finance & administration
## 6543 17907
## Natural & applied sciences Health
## 6963 8000
## Educ., law, community & gov. services Art, culture, recreation & sport
## 11514 1813
## Sales & service Trades, transport & equipm. operators
## 27360 16821
## Natural resources & agriculture Manufacturing & utilities
## 3153 5414
# Add Occupation column with SHORT NAMES
# Copy NOC_10 into NOC_10short and relabel its levels with short names.
# Each list entry reads: new short level = existing long level.
data.all[["NOC_10short"]] <- `levels<-`(
  data.all[["NOC_10"]],
  value = list(
    Mngt    = "Management",
    BusFin  = "Business, finance & administration",
    NatASc  = "Natural & applied sciences",
    Health  = "Health",
    EduLaw  = "Educ., law, community & gov. services",
    ArtCul  = "Art, culture, recreation & sport",
    Sales   = "Sales & service",
    Trades  = "Trades, transport & equipm. operators",
    NatAgri = "Natural resources & agriculture",
    ManUtil = "Manufacturing & utilities"
  )
)
str(data.all$NOC_10short)
## Factor w/ 10 levels "Mngt","BusFin",..: 7 2 10 5 5 8 8 2 4 7 ...
# Occupation at main job (40 categories)
sort(unique(data.all$NOC_40))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
str(data.all$NOC_40)
## int [1:105488] NA NA NA NA NA NA NA NA NA NA ...
# Convert the forty NOC_40 codes into a factor of detailed occupation labels.
# Rows whose NOC_40 is missing stay NA.
data.all[["NOC_40"]] <- factor(
  data.all[["NOC_40"]],
  levels = seq_len(40),
  labels = c(
    "Senior management",
    "Specialized middle management",
    "Middle management in retail and wholesale trade and customer services",
    "Middle management in trades, transportation, production and utilities",
    "Professional occupations in business and finance",
    "Administrative and financial supervisors and administrative occupations",
    "Finance, insurance and related business administrative occupations",
    "Office support",
    "Distribution, tracking and scheduling co-ordination",
    "Professional occupations in natural and applied sciences",
    "Technical occupations related to natural and applied sciences",
    "Professional occupations in nursing",
    "Professional occupations in health (except nursing)",
    "Technical occupations in health",
    "Assisting occupations in support of health services",
    "Professional occupations in education services",
    "Professional occupations in law and social, community and government services",
    "Paraprofessional occupations in legal, social, community and education services",
    "Front-line public protection services",
    "Care providers and educational, legal and public protection support",
    "Professional occupations in art and culture",
    "Technical occupations in art, culture, recreation and sport",
    "Retail sales supervisors and specialized sales",
    "Service supervisors and specialized service",
    "Sales representatives and salespersons - wholesale and retail trade",
    "Service representatives and other customer and personal services occupations",
    "Sales support",
    "Service support and other service occupations, n.e.c.",
    "Industrial, electrical and construction trades",
    "Maintenance and equipment operation trades",
    "Other installers, repairers and servicers and material handlers",
    "Transport and heavy equipment operation and related maintenance occupations",
    "Trades helpers, construction labourers and related occupations",
    "Supervisors and technical occupations in natural resources, agriculture and related production",
    "Workers in natural resources, agriculture and related production",
    "Harvesting, landscaping and natural resources labourers",
    "Processing, manufacturing and utilities supervisors and central control operators",
    "Processing and manufacturing machine operators and related production workers",
    "Assemblers in manufacturing",
    "Labourers in processing, manufacturing and utilities"
  )
)
str(data.all$NOC_40)
## Factor w/ 40 levels "Senior management",..: NA NA NA NA NA NA NA NA NA NA ...
summary(data.all$NOC_40)
## Senior management
## 138
## Specialized middle management
## 1490
## Middle management in retail and wholesale trade and customer services
## 626
## Middle management in trades, transportation, production and utilities
## 702
## Professional occupations in business and finance
## 1576
## Administrative and financial supervisors and administrative occupations
## 3078
## Finance, insurance and related business administrative occupations
## 621
## Office support
## 1952
## Distribution, tracking and scheduling co-ordination
## 858
## Professional occupations in natural and applied sciences
## 1827
## Technical occupations related to natural and applied sciences
## 1756
## Professional occupations in nursing
## 1115
## Professional occupations in health (except nursing)
## 499
## Technical occupations in health
## 1230
## Assisting occupations in support of health services
## 1233
## Professional occupations in education services
## 2479
## Professional occupations in law and social, community and government services
## 1377
## Paraprofessional occupations in legal, social, community and education services
## 1245
## Front-line public protection services
## 355
## Care providers and educational, legal and public protection support
## 915
## Professional occupations in art and culture
## 229
## Technical occupations in art, culture, recreation and sport
## 632
## Retail sales supervisors and specialized sales
## 1548
## Service supervisors and specialized service
## 1855
## Sales representatives and salespersons - wholesale and retail trade
## 2199
## Service representatives and other customer and personal services occupations
## 2354
## Sales support
## 1926
## Service support and other service occupations, n.e.c.
## 2900
## Industrial, electrical and construction trades
## 2557
## Maintenance and equipment operation trades
## 1883
## Other installers, repairers and servicers and material handlers
## 802
## Transport and heavy equipment operation and related maintenance occupations
## 2205
## Trades helpers, construction labourers and related occupations
## 608
## Supervisors and technical occupations in natural resources, agriculture and related production
## 583
## Workers in natural resources, agriculture and related production
## 616
## Harvesting, landscaping and natural resources labourers
## 373
## Processing, manufacturing and utilities supervisors and central control operators
## 638
## Processing and manufacturing machine operators and related production workers
## 949
## Assemblers in manufacturing
## 515
## Labourers in processing, manufacturing and utilities
## 487
## NA's
## 54557
# Main job: full-time vs. part-time (FTPTMAIN)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["FTPTMAIN"]]))
## [1] 1 2
str(data.all[["FTPTMAIN"]])
## int [1:105488] 1 1 1 2 2 1 1 1 1 1 ...
# Turn the integer codes 1/2 into a labelled (unordered) factor.
data.all[["FTPTMAIN"]] <- factor(
  data.all[["FTPTMAIN"]],
  levels = 1:2,
  labels = c("Full-time", "Part-time")
)
# Confirm the recoding and show the count per level.
str(data.all[["FTPTMAIN"]])
## Factor w/ 2 levels "Full-time","Part-time": 1 1 1 2 2 1 1 1 1 1 ...
summary(data.all[["FTPTMAIN"]])
## Full-time Part-time
## 86157 19331
# Union status (UNION)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["UNION"]]))
## [1] 1 2 3
str(data.all[["UNION"]])
## int [1:105488] 3 3 1 3 1 1 1 3 3 1 ...
# Turn the integer codes 1..3 into a labelled (unordered) factor.
data.all[["UNION"]] <- factor(
  data.all[["UNION"]],
  levels = 1:3,
  labels = c(
    "Union member",
    "Not member but covered", # Not a member but covered by a union contract
    "Non-unionized"
  )
)
# Confirm the recoding and show the count per level.
str(data.all[["UNION"]])
## Factor w/ 3 levels "Union member",..: 3 3 1 3 1 1 1 3 3 1 ...
summary(data.all[["UNION"]])
## Union member Not member but covered Non-unionized
## 33074 2116 70298
# Job permanency (PERMTEMP)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["PERMTEMP"]]))
## [1] 1 2 3 4
str(data.all[["PERMTEMP"]])
## int [1:105488] 1 1 1 1 4 1 1 1 1 1 ...
# Turn the integer codes 1..4 into a labelled (unordered) factor.
data.all[["PERMTEMP"]] <- factor(
  data.all[["PERMTEMP"]],
  levels = 1:4,
  labels = c(
    "Permanent",
    "Temp. season",   # Temporary, seasonal
    "Temp. contract", # Temporary, term or contract
    "Temp. casual"    # Temporary, casual or other
  )
)
# Confirm the recoding and show the count per level.
str(data.all[["PERMTEMP"]])
## Factor w/ 4 levels "Permanent","Temp. season",..: 1 1 1 1 4 1 1 1 1 1 ...
summary(data.all[["PERMTEMP"]])
## Permanent Temp. season Temp. contract Temp. casual
## 90764 4305 6583 3836
# Establishment size, in number of employees (ESTSIZE)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["ESTSIZE"]]))
## [1] 1 2 3 4
str(data.all[["ESTSIZE"]])
## int [1:105488] 2 2 4 1 4 4 4 3 1 1 ...
# The size bands have a natural order, so the codes 1..4 become an ordered
# factor (smallest band first).
data.all[["ESTSIZE"]] <- ordered(
  data.all[["ESTSIZE"]],
  levels = 1:4,
  labels = c("<20", "20-99", "100-500", ">500")
)
# Confirm the recoding and show the count per level.
str(data.all[["ESTSIZE"]])
## Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 3 1 1 ...
summary(data.all[["ESTSIZE"]])
## <20 20-99 100-500 >500
## 36269 35799 20604 12816
# Firm size, in number of employees (FIRMSIZE)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["FIRMSIZE"]]))
## [1] 1 2 3 4
str(data.all[["FIRMSIZE"]])
## int [1:105488] 2 2 4 1 4 4 4 4 1 1 ...
# Same size bands as ESTSIZE; the codes 1..4 become an ordered factor
# (smallest band first).
data.all[["FIRMSIZE"]] <- ordered(
  data.all[["FIRMSIZE"]],
  levels = 1:4,
  labels = c("<20", "20-99", "100-500", ">500")
)
# Confirm the recoding and show the count per level.
str(data.all[["FIRMSIZE"]])
## Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 4 1 1 ...
summary(data.all[["FIRMSIZE"]])
## <20 20-99 100-500 >500
## 20662 17463 15467 51896
# Current student status (SCHOOLN)
# Check which raw codes occur and how they are stored before recoding.
sort(unique(data.all[["SCHOOLN"]]))
## [1] 1 2 3 4
str(data.all[["SCHOOLN"]])
## num [1:105488] 1 1 1 1 1 1 1 1 1 1 ...
# Turn the numeric codes 1..4 into a labelled (unordered) factor.
data.all[["SCHOOLN"]] <- factor(
  data.all[["SCHOOLN"]],
  levels = 1:4,
  labels = c(
    "Non-student",
    "Full-time student",
    "Part-time student",
    "Unknown"
  )
)
# Confirm the recoding and show the count per level.
str(data.all[["SCHOOLN"]])
## Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data.all[["SCHOOLN"]])
## Non-student Full-time student Part-time student Unknown
## 92872 7260 2354 3002
# Type of Economic Family (EFAMTYPE)
# The 18 integer codes are recoded to short labels; the meaning of every
# label is written next to it in the labels vector below.
sort(unique(data.all[["EFAMTYPE"]]))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
str(data.all[["EFAMTYPE"]])
## int [1:105488] 1 2 3 2 2 5 5 1 2 1 ...
data.all[["EFAMTYPE"]] <- factor(
  data.all[["EFAMTYPE"]],
  levels = 1:18,
  labels = c(
    "Ind",    # Unattached individual
    "HWDENC", # Husband-wife, dual earner couple, no children or none under 25
    "HWDE17", # Husband-wife, dual earner couple, youngest child 0 to 17
    "HWDE24", # Husband-wife, dual earner couple, youngest child 18 to 24
    "HWSHNC", # Husband-wife, single earner couple, husband employed, no children or none under 25
    "HWSH17", # Husband-wife, single earner couple, husband employed, youngest child 0 to 17
    "HWSH24", # Husband-wife, single earner couple, husband employed, youngest child 18 to 24
    "HWSWNC", # Husband-wife, single earner couple, wife employed, no children or none under 25
    "HWSW17", # Husband-wife, single earner couple, wife employed, youngest child 0 to 17
    "HWSW24", # Husband-wife, single earner couple, wife employed, youngest child 18 to 24
    "HWNENC", # Husband-wife, non-earner couple, no children or none under 25
    "HWNE17", # Husband-wife, non-earner couple, youngest child 0 to 17
    "HWNE24", # Husband-wife, non-earner couple, youngest child 18 to 24
    "SPE17",  # Single-parent family, parent employed, youngest child 0 to 17
    "SPE24",  # Single-parent family, parent employed, youngest child 18 to 24
    "SPN17",  # Single-parent family, parent not employed, youngest child 0 to 17
    "SPN24",  # Single-parent family, parent not employed, youngest child 18 to 24
    "Other"   # Other families
  )
)
# Confirm the recoding and show the count per level.
str(data.all[["EFAMTYPE"]])
## Factor w/ 18 levels "Ind","HWDENC",..: 1 2 3 2 2 5 5 1 2 1 ...
summary(data.all[["EFAMTYPE"]])
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 16842 22136 28911 7892 4053 4009 986 3423 1355 712 749
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 109 181 4991 2024 267 189 6659
# Age of Youngest Child, in years (AGYOWNK)
# NOTE(review): code 1 is labelled ">6" although it is ordered below "6-12";
# presumably it was meant to read "<6" (youngest child under 6). Confirm
# against the codebook before renaming, because the level name also shows up
# in the recorded outputs of this script.
sort(unique(data.all[["AGYOWNK"]]))
## [1] 1 2 3 4 5
str(data.all[["AGYOWNK"]])
## num [1:105488] 5 5 2 5 5 5 5 5 5 5 ...
# The age bands have a natural order, so the codes 1..5 become an ordered
# factor.
data.all[["AGYOWNK"]] <- ordered(
  data.all[["AGYOWNK"]],
  levels = 1:5,
  labels = c(
    ">6",
    "6-12",
    "13-17",
    "18-24",
    ">24|NC" # |NC means OR No Children
  )
)
# Confirm the recoding and show the count per level.
str(data.all[["AGYOWNK"]])
## Ord.factor w/ 5 levels ">6"<"6-12"<"13-17"<..: 5 5 2 5 5 5 5 5 5 5 ...
summary(data.all[["AGYOWNK"]])
## >6 6-12 13-17 18-24 >24|NC
## 13510 11724 8270 6840 65144
# Structure of the complete data set after the factor recoding.
str(data.all)
## 'data.frame': 105488 obs. of 30 variables:
## $ REC_NUM : int 4 5 10 11 12 15 16 18 19 21 ...
## $ SURVYEAR : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ SURVMNTH : int 9 9 9 9 9 9 9 9 9 9 ...
## $ LFSSTAT : Factor w/ 2 levels "Employed, at work",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PROV : Factor w/ 10 levels "NL","PEI","NS",..: 7 6 6 5 3 9 5 3 9 10 ...
## $ CMA : Factor w/ 4 levels "Montreal","Toronto",..: 4 4 4 4 4 4 4 4 4 3 ...
## $ AGE_12 : Ord.factor w/ 12 levels "15-19"<"20-24"<..: 4 8 6 4 3 9 8 2 2 10 ...
## $ SEX : Factor w/ 2 levels "Male","Female": 1 2 1 2 1 1 1 1 2 2 ...
## $ MARSTAT : Factor w/ 6 levels "Married","Common-law",..: 6 1 1 1 2 1 2 6 1 6 ...
## $ EDUC : Ord.factor w/ 7 levels "0 to 8 years"<..: 5 5 3 5 6 2 5 3 5 2 ...
## $ MJH : Factor w/ 2 levels "Single jobholder",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ COWMAIN : Factor w/ 2 levels "Public sector",..: 2 2 2 2 1 2 2 2 2 2 ...
## $ IMMIG : Factor w/ 3 levels "Immigrant, landed =< 10 years",..: NA NA NA NA NA NA NA NA NA NA ...
## $ NAICS_18 : Factor w/ 18 levels "Agriculture",..: 17 8 5 14 13 6 4 12 14 14 ...
## $ NOC_10 : Factor w/ 10 levels "Management","Business, finance & administration",..: 7 2 10 5 5 8 8 2 4 7 ...
## $ NOC_40 : Factor w/ 40 levels "Senior management",..: NA NA NA NA NA NA NA NA NA NA ...
## $ FTPTMAIN : Factor w/ 2 levels "Full-time","Part-time": 1 1 1 2 2 1 1 1 1 1 ...
## $ UTOTHRS : int 450 375 400 496 240 400 370 375 400 450 ...
## $ TENURE : int 105 14 115 36 7 237 39 11 29 39 ...
## $ HRLYEARN : int 2564 1949 3750 1690 667 2550 2600 1380 2100 1700 ...
## $ UNION : Factor w/ 3 levels "Union member",..: 3 3 1 3 1 1 1 3 3 1 ...
## $ PERMTEMP : Factor w/ 4 levels "Permanent","Temp. season",..: 1 1 1 1 4 1 1 1 1 1 ...
## $ ESTSIZE : Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 3 1 1 ...
## $ FIRMSIZE : Ord.factor w/ 4 levels "<20"<"20-99"<..: 2 2 4 1 4 4 4 4 1 1 ...
## $ SCHOOLN : Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ EFAMTYPE : Factor w/ 18 levels "Ind","HWDENC",..: 1 2 3 2 2 5 5 1 2 1 ...
## $ AGYOWNK : Ord.factor w/ 5 levels ">6"<"6-12"<"13-17"<..: 5 5 2 5 5 5 5 5 5 5 ...
## $ EDUCshort : Ord.factor w/ 7 levels "No.HS"<"Some.HS"<..: 5 5 3 5 6 2 5 3 5 2 ...
## $ NAICS_18short: Factor w/ 18 levels "Agri","Fores",..: 17 8 5 14 13 6 4 12 14 14 ...
## $ NOC_10short : Factor w/ 10 levels "Mngt","BusFin",..: 7 2 10 5 5 8 8 2 4 7 ...
# NUMERIC VARIABLES REAL VALUES
# Bring the integer-coded columns to real units: usual hours are divided by
# 10 and hourly earnings by 100 (e.g. 450 -> 45.0, 2564 -> 25.64).
data.all[["UTOTHRS"]] <- data.all[["UTOTHRS"]] / 10
data.all[["HRLYEARN"]] <- data.all[["HRLYEARN"]] / 100
# NUMERIC VARIABLES SUMMARY
# Five-number summaries (plus mean) of the three numeric variables, by year.
num.vars <- c("UTOTHRS", "TENURE", "HRLYEARN")
summary(data.all[data.all[["SURVYEAR"]] == 2009, num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 0.40 Min. : 1.0 Min. : 2.00
## 1st Qu.:35.00 1st Qu.: 15.0 1st Qu.: 13.00
## Median :40.00 Median : 51.0 Median : 19.00
## Mean :36.13 Mean : 85.3 Mean : 21.47
## 3rd Qu.:40.00 3rd Qu.:139.0 3rd Qu.: 27.24
## Max. :99.00 Max. :240.0 Max. :115.38
summary(data.all[data.all[["SURVYEAR"]] == 2019, num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 1.00 Min. : 1.00 Min. : 3.00
## 1st Qu.:35.00 1st Qu.: 15.00 1st Qu.: 17.00
## Median :40.00 Median : 55.00 Median : 24.00
## Mean :36.34 Mean : 86.79 Mean : 27.48
## 3rd Qu.:40.00 3rd Qu.:146.00 3rd Qu.: 34.97
## Max. :99.00 Max. :240.00 Max. :107.96
# Standard deviation of each numeric variable, by year.
sd(data.all[data.all[["SURVYEAR"]] == 2009, "UTOTHRS"])
## [1] 11.23495
sd(data.all[data.all[["SURVYEAR"]] == 2019, "UTOTHRS"])
## [1] 11.16521
sd(data.all[data.all[["SURVYEAR"]] == 2009, "TENURE"])
## [1] 83.43826
sd(data.all[data.all[["SURVYEAR"]] == 2019, "TENURE"])
## [1] 82.60483
sd(data.all[data.all[["SURVYEAR"]] == 2009, "HRLYEARN"])
## [1] 11.27651
sd(data.all[data.all[["SURVYEAR"]] == 2019, "HRLYEARN"])
## [1] 13.66143
# DESCRIPTIVE STATISTICS BY YEAR AND GENDER
# Summaries of the numeric variables (num.vars) for each year/sex subset.
summary(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Male", num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 0.40 Min. : 1.00 Min. : 2.14
## 1st Qu.:37.50 1st Qu.: 15.00 1st Qu.: 14.50
## Median :40.00 Median : 51.00 Median : 20.51
## Mean :39.01 Mean : 86.29 Mean : 23.25
## 3rd Qu.:40.00 3rd Qu.:144.00 3rd Qu.: 29.80
## Max. :99.00 Max. :240.00 Max. :115.38
summary(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Male", num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 1.00 Min. : 1.00 Min. : 3.00
## 1st Qu.:37.50 1st Qu.: 15.00 1st Qu.: 18.00
## Median :40.00 Median : 53.00 Median : 25.65
## Mean :38.76 Mean : 85.67 Mean : 29.29
## 3rd Qu.:40.00 3rd Qu.:144.00 3rd Qu.: 37.00
## Max. :99.00 Max. :240.00 Max. :107.96
summary(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Female", num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 0.40 Min. : 1.00 Min. : 2.00
## 1st Qu.:30.00 1st Qu.: 16.00 1st Qu.:11.75
## Median :37.00 Median : 50.00 Median :17.14
## Mean :33.33 Mean : 84.35 Mean :19.74
## 3rd Qu.:40.00 3rd Qu.:135.00 3rd Qu.:24.91
## Max. :99.00 Max. :240.00 Max. :89.74
summary(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Female", num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 1.00 Min. : 1.00 Min. : 3.07
## 1st Qu.:30.00 1st Qu.: 16.00 1st Qu.: 16.00
## Median :37.50 Median : 57.00 Median : 22.00
## Mean :33.91 Mean : 87.91 Mean : 25.66
## 3rd Qu.:40.00 3rd Qu.:148.00 3rd Qu.: 31.79
## Max. :99.00 Max. :240.00 Max. :106.67
# Standard deviation of every numeric variable per subset. vapply() walks the
# data-frame columns directly (no coercion of the data frame to a matrix, as
# apply() would do) and guarantees one numeric value per column.
vapply(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Male", num.vars], sd, numeric(1))
## UTOTHRS TENURE HRLYEARN
## 10.94336 84.67368 11.94321
vapply(data.all[data.all$SURVYEAR == 2009 & data.all$SEX == "Female", num.vars], sd, numeric(1))
## UTOTHRS TENURE HRLYEARN
## 10.79760 82.20528 10.29534
vapply(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Male", num.vars], sd, numeric(1))
## UTOTHRS TENURE HRLYEARN
## 11.04963 82.52859 14.45311
vapply(data.all[data.all$SURVYEAR == 2019 & data.all$SEX == "Female", num.vars], sd, numeric(1))
## UTOTHRS TENURE HRLYEARN
## 10.74592 82.66763 12.55978
# OUTLIERS IN NUMERIC VARIABLES
# One boxplot per numeric variable, split into the four year/sex groups.
# The value returned by boxplot() is kept so the box statistics can be printed:
#   $stats - one column per group; rows are lower whisker, lower hinge,
#            median, upper hinge, upper whisker
#   $n     - number of observations in each group
#   $names - group labels, in column order
#   $group - for every point drawn beyond the whiskers, the index of its
#            group; table() of it therefore counts flagged outliers per group
hours <- boxplot(UTOTHRS ~ SURVYEAR + SEX, data = data.all, main = "Usual hours worked")
hours[c("stats", "n", "names")]
## $stats
## [,1] [,2] [,3] [,4]
## [1,] 33.8 33.8 15 15.0
## [2,] 37.5 37.5 30 30.0
## [3,] 40.0 40.0 37 37.5
## [4,] 40.0 40.0 40 40.0
## [5,] 43.5 43.7 55 55.0
##
## $n
## [1] 26942 25499 27615 25432
##
## $names
## [1] "2009.Male" "2019.Male" "2009.Female" "2019.Female"
# Number of points beyond the whiskers in each group (columns as in $names).
table(hours$group)
##
## 1 2 3 4
## 8453 8080 2324 2096
tenure <- boxplot(TENURE ~ SURVYEAR + SEX, data = data.all, main = "Job tenure with current employer")
tenure[c("stats","n")]
## $stats
## [,1] [,2] [,3] [,4]
## [1,] 1 1 1 1
## [2,] 15 15 16 16
## [3,] 51 53 50 57
## [4,] 144 144 135 148
## [5,] 240 240 240 240
## attr(,"class")
## 2009.Male
## "integer"
##
## $n
## [1] 26942 25499 27615 25432
# The whiskers reach 1 and 240 in every group, so no tenure value is flagged
# and the table below is empty.
table(tenure$group)
## < table of extent 0 >
wage <- boxplot(HRLYEARN ~ SURVYEAR + SEX, data = data.all, main = "Usual hourly wages")
wage[c("stats","n")]
## $stats
## [,1] [,2] [,3] [,4]
## [1,] 2.14 3.00 2.00 3.07
## [2,] 14.50 18.00 11.75 16.00
## [3,] 20.51 25.65 17.14 22.00
## [4,] 29.80 37.00 24.91 31.79
## [5,] 52.69 65.38 44.62 55.38
##
## $n
## [1] 26942 25499 27615 25432
# Number of wage observations beyond the whiskers in each group.
table(wage$group)
##
## 1 2 3 4
## 675 618 708 681
# Number of records in each survey year / sex cell.
table(data.all[["SURVYEAR"]], data.all[["SEX"]])
##
## Male Female
## 2009 26942 27615
## 2019 25499 25432
# One data frame per year/sex combination.
data.all.09male <- data.all[with(data.all, SURVYEAR == 2009 & SEX == "Male"), ]
data.all.19male <- data.all[with(data.all, SURVYEAR == 2019 & SEX == "Male"), ]
data.all.09fem <- data.all[with(data.all, SURVYEAR == 2009 & SEX == "Female"), ]
data.all.19fem <- data.all[with(data.all, SURVYEAR == 2019 & SEX == "Female"), ]
# HISTOGRAMS ###################################################################
# par(mfrow=c(2, 2))
par(mar = c(4.1, 4.1, 1.1, 1.1))
# Usual hours worked
# One density histogram per year/sex subset, overlaid with the normal density
# that has the subset's own mean and standard deviation. Panels are drawn in
# the order Males 2009, Males 2019, Females 2009, Females 2019.
local({
  panels <- list(
    list(obs = data.all.09male$UTOTHRS, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.19male$UTOTHRS, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.09fem$UTOTHRS, fill = "moccasin", line = "darkorange2"),
    list(obs = data.all.19fem$UTOTHRS, fill = "moccasin", line = "darkorange2")
  )
  for (panel in panels) {
    obs <- panel$obs
    hist(obs,
         xlim = c(0, 100), ylim = c(0, 0.25),
         breaks = 50, freq = FALSE,
         cex.lab = 1.5, cex.axis = 1.5,
         col = panel$fill,
         main = "", xlab = "Usual hours worked")
    curve(dnorm(x, mean = mean(obs), sd = sd(obs)),
          col = panel$line, lwd = 2, add = TRUE)
  }
})
# Tenure
# One density histogram per year/sex subset, overlaid with the normal density
# that has the subset's own mean and standard deviation. Panels are drawn in
# the order Males 2009, Males 2019, Females 2009, Females 2019.
local({
  panels <- list(
    list(obs = data.all.09male$TENURE, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.19male$TENURE, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.09fem$TENURE, fill = "moccasin", line = "darkorange2"),
    list(obs = data.all.19fem$TENURE, fill = "moccasin", line = "darkorange2")
  )
  for (panel in panels) {
    obs <- panel$obs
    hist(obs,
         xlim = c(0, 240), ylim = c(0, 0.03),
         breaks = 50, freq = FALSE,
         cex.lab = 1.5, cex.axis = 1.5,
         col = panel$fill,
         main = "", xlab = "Tenure with current employer in months")
    curve(dnorm(x, mean = mean(obs), sd = sd(obs)),
          col = panel$line, lwd = 2, add = TRUE)
  }
})
# Wages
# One density histogram per year/sex subset, overlaid with the normal density
# that has the subset's own mean and standard deviation. Panels are drawn in
# the order Males 2009, Males 2019, Females 2009, Females 2019.
# All four panels share one hist() call so they get identical axis limits and
# text sizes (cex.lab / cex.axis = 1.5), matching the hours and tenure
# histograms.
local({
  panels <- list(
    list(obs = data.all.09male$HRLYEARN, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.19male$HRLYEARN, fill = "slategray2", line = "royalblue4"),
    list(obs = data.all.09fem$HRLYEARN, fill = "moccasin", line = "darkorange2"),
    list(obs = data.all.19fem$HRLYEARN, fill = "moccasin", line = "darkorange2")
  )
  for (panel in panels) {
    obs <- panel$obs
    hist(obs,
         xlim = c(0, 120), ylim = c(0, 0.085),
         breaks = 50, freq = FALSE,
         cex.lab = 1.5, cex.axis = 1.5,
         col = panel$fill,
         main = "", xlab = "Usual hourly wages")
    curve(dnorm(x, mean = mean(obs), sd = sd(obs)),
          col = panel$line, lwd = 2, add = TRUE)
  }
})
# QQ PLOTS #####################################################################
# Normal Q-Q plot of each numeric variable for every year/sex subset.
# Each plot's title names the subset whose data is plotted.
# Usual hours worked
ggqqplot(data.all.09male$UTOTHRS, title = "Males 2009")
ggqqplot(data.all.19male$UTOTHRS, title = "Males 2019")
ggqqplot(data.all.09fem$UTOTHRS, title = "Females 2009")
ggqqplot(data.all.19fem$UTOTHRS, title = "Females 2019")
# Tenure
ggqqplot(data.all.09male$TENURE, title = "Males 2009")
ggqqplot(data.all.19male$TENURE, title = "Males 2019")
ggqqplot(data.all.09fem$TENURE, title = "Females 2009")
ggqqplot(data.all.19fem$TENURE, title = "Females 2019")
# Wages
ggqqplot(data.all.09male$HRLYEARN, title = "Males 2009")
ggqqplot(data.all.19male$HRLYEARN, title = "Males 2019")
ggqqplot(data.all.09fem$HRLYEARN, title = "Females 2009")
ggqqplot(data.all.19fem$HRLYEARN, title = "Females 2019")
# KOLMOGOROV-SMIRNOV ###########################################################
# One-sample Kolmogorov-Smirnov test of each numeric variable against a normal
# distribution, run separately for every year/sex subset. The reference
# distribution uses the subset's own sample mean and standard deviation.
# The "ties" warnings are raised because the variables contain repeated values.
# NOTE(review): the ks.test documentation says the parameters of the reference
# distribution must be pre-specified, not estimated from the data; with
# estimated parameters the reported p-values are presumably not exact.
# Usual Hours Worked
ks.test(data.all.09male$UTOTHRS, "pnorm", mean = mean(data.all.09male$UTOTHRS), sd = sd(data.all.09male$UTOTHRS))
## Warning in ks.test(data.all.09male$UTOTHRS, "pnorm", mean = mean(data.all.
## 09male$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09male$UTOTHRS
## D = 0.266, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$UTOTHRS, "pnorm", mean = mean(data.all.19male$UTOTHRS), sd = sd(data.all.19male$UTOTHRS))
## Warning in ks.test(data.all.19male$UTOTHRS, "pnorm", mean = mean(data.all.
## 19male$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19male$UTOTHRS
## D = 0.26344, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$UTOTHRS, "pnorm", mean = mean(data.all.09fem$UTOTHRS), sd = sd(data.all.09fem$UTOTHRS))
## Warning in ks.test(data.all.09fem$UTOTHRS, "pnorm", mean = mean(data.all.
## 09fem$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09fem$UTOTHRS
## D = 0.21438, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$UTOTHRS, "pnorm", mean = mean(data.all.19fem$UTOTHRS), sd = sd(data.all.19fem$UTOTHRS))
## Warning in ks.test(data.all.19fem$UTOTHRS, "pnorm", mean = mean(data.all.
## 19fem$UTOTHRS), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19fem$UTOTHRS
## D = 0.21143, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Tenure
ks.test(data.all.09male$TENURE, "pnorm", mean = mean(data.all.09male$TENURE), sd = sd(data.all.09male$TENURE))
## Warning in ks.test(data.all.09male$TENURE, "pnorm", mean = mean(data.all.
## 09male$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09male$TENURE
## D = 0.16707, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$TENURE, "pnorm", mean = mean(data.all.19male$TENURE), sd = sd(data.all.19male$TENURE))
## Warning in ks.test(data.all.19male$TENURE, "pnorm", mean = mean(data.all.
## 19male$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19male$TENURE
## D = 0.15484, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$TENURE, "pnorm", mean = mean(data.all.09fem$TENURE), sd = sd(data.all.09fem$TENURE))
## Warning in ks.test(data.all.09fem$TENURE, "pnorm", mean = mean(data.all.
## 09fem$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09fem$TENURE
## D = 0.16497, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$TENURE, "pnorm", mean = mean(data.all.19fem$TENURE), sd = sd(data.all.19fem$TENURE))
## Warning in ks.test(data.all.19fem$TENURE, "pnorm", mean = mean(data.all.
## 19fem$TENURE), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19fem$TENURE
## D = 0.1523, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Wage
ks.test(data.all.09male$HRLYEARN, "pnorm", mean = mean(data.all.09male$HRLYEARN), sd = sd(data.all.09male$HRLYEARN))
## Warning in ks.test(data.all.09male$HRLYEARN, "pnorm", mean = mean(data.all.
## 09male$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09male$HRLYEARN
## D = 0.098075, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19male$HRLYEARN, "pnorm", mean = mean(data.all.19male$HRLYEARN), sd = sd(data.all.19male$HRLYEARN))
## Warning in ks.test(data.all.19male$HRLYEARN, "pnorm", mean = mean(data.all.
## 19male$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19male$HRLYEARN
## D = 0.10788, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.09fem$HRLYEARN, "pnorm", mean = mean(data.all.09fem$HRLYEARN), sd = sd(data.all.09fem$HRLYEARN))
## Warning in ks.test(data.all.09fem$HRLYEARN, "pnorm", mean = mean(data.all.
## 09fem$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.09fem$HRLYEARN
## D = 0.11895, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(data.all.19fem$HRLYEARN, "pnorm", mean = mean(data.all.19fem$HRLYEARN), sd = sd(data.all.19fem$HRLYEARN))
## Warning in ks.test(data.all.19fem$HRLYEARN, "pnorm", mean = mean(data.all.
## 19fem$HRLYEARN), : ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: data.all.19fem$HRLYEARN
## D = 0.12557, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Pearson correlation (numeric variables only)
# Correlation matrix of the three numeric variables for each year/sex subset,
# rounded to two decimals.
round(cor(data.all.09male[, num.vars]), digits = 2)
## UTOTHRS TENURE HRLYEARN
## UTOTHRS 1.00 0.11 0.14
## TENURE 0.11 1.00 0.30
## HRLYEARN 0.14 0.30 1.00
round(cor(data.all.19male[, num.vars]), digits = 2)
## UTOTHRS TENURE HRLYEARN
## UTOTHRS 1.00 0.12 0.17
## TENURE 0.12 1.00 0.30
## HRLYEARN 0.17 0.30 1.00
round(cor(data.all.09fem[, num.vars]), digits = 2)
## UTOTHRS TENURE HRLYEARN
## UTOTHRS 1.00 0.18 0.20
## TENURE 0.18 1.00 0.35
## HRLYEARN 0.20 0.35 1.00
round(cor(data.all.19fem[, num.vars]), digits = 2)
## UTOTHRS TENURE HRLYEARN
## UTOTHRS 1.00 0.14 0.19
## TENURE 0.14 1.00 0.34
## HRLYEARN 0.19 0.34 1.00
# Spearman correlation (numeric and ordinal variables)
# The ordered factors are converted to their integer level codes and put in
# one data frame next to the numeric variables, with SURVYEAR and SEX in
# front so the rows can be filtered by subset.
ord.vars <- c("AGE_12", "EDUC", "ESTSIZE", "FIRMSIZE", "AGYOWNK")
ord <- lapply(data.all[, ord.vars], as.numeric)
ord.num <- cbind(data.all[c("SURVYEAR", "SEX")], data.all[num.vars], ord)
# Rank correlations per year/sex subset; the key columns SURVYEAR and SEX are
# left out by selecting the analysis columns by name.
round(cor(ord.num[with(ord.num, SURVYEAR == 2009 & SEX == "Male"), c(num.vars, ord.vars)], method = "spearman"), digits = 2)
## UTOTHRS TENURE HRLYEARN AGE_12 EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS 1.00 0.10 0.14 0.13 -0.01 -0.04 -0.10 -0.13
## TENURE 0.10 1.00 0.37 0.47 0.09 0.20 0.19 -0.14
## HRLYEARN 0.14 0.37 1.00 0.31 0.37 0.27 0.24 -0.24
## AGE_12 0.13 0.47 0.31 1.00 0.10 0.09 0.06 -0.04
## EDUC -0.01 0.09 0.37 0.10 1.00 0.16 0.15 -0.16
## ESTSIZE -0.04 0.20 0.27 0.09 0.16 1.00 0.60 -0.07
## FIRMSIZE -0.10 0.19 0.24 0.06 0.15 0.60 1.00 -0.05
## AGYOWNK -0.13 -0.14 -0.24 -0.04 -0.16 -0.07 -0.05 1.00
round(cor(ord.num[with(ord.num, SURVYEAR == 2019 & SEX == "Male"), c(num.vars, ord.vars)], method = "spearman"), digits = 2)
## UTOTHRS TENURE HRLYEARN AGE_12 EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS 1.00 0.11 0.18 0.11 -0.02 0.02 -0.06 -0.12
## TENURE 0.11 1.00 0.36 0.47 0.10 0.17 0.18 -0.12
## HRLYEARN 0.18 0.36 1.00 0.26 0.39 0.28 0.25 -0.26
## AGE_12 0.11 0.47 0.26 1.00 0.08 0.07 0.04 -0.02
## EDUC -0.02 0.10 0.39 0.08 1.00 0.18 0.18 -0.17
## ESTSIZE 0.02 0.17 0.28 0.07 0.18 1.00 0.61 -0.09
## FIRMSIZE -0.06 0.18 0.25 0.04 0.18 0.61 1.00 -0.07
## AGYOWNK -0.12 -0.12 -0.26 -0.02 -0.17 -0.09 -0.07 1.00
round(cor(ord.num[with(ord.num, SURVYEAR == 2009 & SEX == "Female"), c(num.vars, ord.vars)], method = "spearman"), digits = 2)
## UTOTHRS TENURE HRLYEARN AGE_12 EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS 1.00 0.16 0.22 0.12 0.14 0.10 0.04 -0.05
## TENURE 0.16 1.00 0.42 0.51 0.11 0.20 0.16 -0.08
## HRLYEARN 0.22 0.42 1.00 0.27 0.48 0.34 0.26 -0.17
## AGE_12 0.12 0.51 0.27 1.00 0.05 0.09 0.04 0.05
## EDUC 0.14 0.11 0.48 0.05 1.00 0.16 0.13 -0.15
## ESTSIZE 0.10 0.20 0.34 0.09 0.16 1.00 0.55 -0.05
## FIRMSIZE 0.04 0.16 0.26 0.04 0.13 0.55 1.00 -0.04
## AGYOWNK -0.05 -0.08 -0.17 0.05 -0.15 -0.05 -0.04 1.00
round(cor(ord.num[with(ord.num, SURVYEAR == 2019 & SEX == "Female"), c(num.vars, ord.vars)], method = "spearman"), digits = 2)
## UTOTHRS TENURE HRLYEARN AGE_12 EDUC ESTSIZE FIRMSIZE AGYOWNK
## UTOTHRS 1.00 0.13 0.21 0.09 0.14 0.11 0.04 -0.05
## TENURE 0.13 1.00 0.40 0.50 0.10 0.19 0.18 -0.07
## HRLYEARN 0.21 0.40 1.00 0.20 0.48 0.33 0.27 -0.20
## AGE_12 0.09 0.50 0.20 1.00 0.01 0.05 0.01 0.09
## EDUC 0.14 0.10 0.48 0.01 1.00 0.18 0.14 -0.19
## ESTSIZE 0.11 0.19 0.33 0.05 0.18 1.00 0.58 -0.06
## FIRMSIZE 0.04 0.18 0.27 0.01 0.14 0.58 1.00 -0.05
## AGYOWNK -0.05 -0.07 -0.20 0.09 -0.19 -0.06 -0.05 1.00
# Overall summary of the numeric variables (both years, both sexes).
summary(data.all[, num.vars])
## UTOTHRS TENURE HRLYEARN
## Min. : 0.40 Min. : 1.00 Min. : 2.00
## 1st Qu.:35.00 1st Qu.: 15.00 1st Qu.: 15.00
## Median :40.00 Median : 53.00 Median : 21.00
## Mean :36.23 Mean : 86.02 Mean : 24.37
## 3rd Qu.:40.00 3rd Qu.:143.00 3rd Qu.: 30.77
## Max. :99.00 Max. :240.00 Max. :115.38
# HW v. USUAL HOURS WORKED PER WEEK ############################################
# Scatter plot of hourly wage against usual weekly hours for each year/sex
# subset, with a lowess smoother drawn on top in red. Panels are drawn in the
# order Males 2009, Males 2019, Females 2009, Females 2019.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(4.5, 4.5, 1.0, 1.0))
local({
  groups <- list(data.all.09male, data.all.19male, data.all.09fem, data.all.19fem)
  for (grp in groups) {
    hrs <- grp$UTOTHRS
    pay <- grp$HRLYEARN
    plot(hrs, pay,
         xlim = c(0, 120), ylim = c(0, 120),
         cex.lab = 1.8, cex.axis = 1.8,
         xlab = "Usual Hours Worked per Week (UTOTHRS)",
         ylab = "Hourly Wage (HRLYEARN)")
    lines(lowess(hrs, pay), col = "red", lwd = 8)
  }
})
# HW v. TENURE #################################################################
# Scatter plot of hourly wage against job tenure for each year/sex subset,
# with a lowess smoother drawn on top in red. Panels are drawn in the order
# Males 2009, Males 2019, Females 2009, Females 2019.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
par(mar = c(4.5, 4.5, 1.0, 1.0))
local({
  groups <- list(data.all.09male, data.all.19male, data.all.09fem, data.all.19fem)
  for (grp in groups) {
    months <- grp$TENURE
    pay <- grp$HRLYEARN
    plot(months, pay,
         xlim = c(0, 240), ylim = c(0, 240),
         cex.lab = 1.8, cex.axis = 1.8,
         xlab = "Tenure in Months",
         ylab = "Hourly Wage (HRLYEARN)")
    lines(lowess(months, pay), col = "red", lwd = 8)
  }
})
# Level counts of the ordinal variables.
summary(data.all[, ord.vars])
## AGE_12 EDUC ESTSIZE
## 45-49 :12952 0 to 8 years : 2003 <20 :36269
## 50-54 :12380 Some high school : 9783 20-99 :35799
## 40-44 :11782 High school graduate :21594 100-500:20604
## 35-39 :11475 Some postsecondary : 7400 >500 :12816
## 30-34 :11045 Postsecondary certificate or diploma:39814
## 25-29 :10937 Bachelor's degree :17378
## (Other):34917 Above bachelor's degree : 7516
## FIRMSIZE AGYOWNK
## <20 :20662 >6 :13510
## 20-99 :17463 6-12 :11724
## 100-500:15467 13-17 : 8270
## >500 :51896 18-24 : 6840
## >24|NC:65144
##
##
# HW v. AGE GROUPS #############################################################
# Boxplots of hourly wage by age group, one panel per year/sex subset, drawn
# in the order Males 2009, Males 2019, Females 2009, Females 2019.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1)); a larger bottom margin is set
# before every panel, and the x-axis title is written with mtext() below the
# perpendicular (las = 2) tick labels.
local({
  panels <- list(
    list(grp = data.all.09male, fill = "slategray2"),
    list(grp = data.all.19male, fill = "royalblue"),
    list(grp = data.all.09fem, fill = "moccasin"),
    list(grp = data.all.19fem, fill = "darkorange")
  )
  for (panel in panels) {
    par(mar = c(8, 4.5, 1.0, 1.0))
    plot(panel$grp$AGE_12, panel$grp$HRLYEARN,
         ylim = c(0, 115),
         cex.lab = 1.8, cex.axis = 1.8,
         las = 2,
         col = panel$fill,
         xlab = "", ylab = "Hourly Wage")
    mtext("Age Groups", side = 1, line = 5.8, cex = 1.8)
  }
})
# HW v. EDUCATION ##############################################################
# Level counts of the short education labels used on the x axis.
summary(data.all[["EDUCshort"]])
## No.HS Some.HS HS.grad Some.Post Post.cert Bachelor Above.B
## 2003 9783 21594 7400 39814 17378 7516
# Boxplots of hourly wage by highest education attainment, one panel per
# year/sex subset, drawn in the order Males 2009, Males 2019, Females 2009,
# Females 2019.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1)); larger bottom and left margins are
# set before every panel, and both axis titles are written with mtext().
local({
  panels <- list(
    list(grp = data.all.09male, fill = "slategray2"),
    list(grp = data.all.19male, fill = "royalblue"),
    list(grp = data.all.09fem, fill = "moccasin"),
    list(grp = data.all.19fem, fill = "darkorange")
  )
  for (panel in panels) {
    par(mar = c(10.5, 5.5, 1.0, 1.0))
    plot(panel$grp$EDUCshort, panel$grp$HRLYEARN,
         ylim = c(0, 115),
         cex.lab = 1.8, cex.axis = 1.8,
         las = 2,
         col = panel$fill,
         xlab = "", ylab = "")
    mtext("Highest Education Attainment", side = 1, line = 9, cex = 1.8)
    mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
  }
})
# HW v. ESTABLISHMENT SIZE #####################################################
# Hourly wage against establishment size, one plot per gender/year subset.
# Only the first panel (Males 2009) draws the "Hourly Wage" side label; the
# x-axis titles are left disabled. The wide right margin (18) is kept as in
# the original layout.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09male$ESTSIZE, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19male$ESTSIZE, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09fem$ESTSIZE, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Must read the 2019 female subset; plotting data.all.09fem here would just
# repeat the Females 2009 panel in a different colour.
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19fem$ESTSIZE, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Establishment Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# HW v. FIRM SIZE #####################################################
# Hourly wage against firm size, one plot per gender/year subset.
# Only the first panel (Males 2009) draws the "Hourly Wage" side label; the
# x-axis titles are left disabled.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09male$FIRMSIZE, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19male$FIRMSIZE, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09fem$FIRMSIZE, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Must read the 2019 female subset; plotting data.all.09fem here would just
# repeat the Females 2009 panel in a different colour.
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19fem$FIRMSIZE, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Firm Size", side=1, line=7.5, cex =1.8)
# mtext("(Number of Employees)", side=1, line=8.8, cex =1.4)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# HW v. AGE OF YOUNGEST CHILD ##################################################
# Level counts for the variables listed in ord.vars (whole sample).
summary(data.all[ord.vars])
## AGE_12 EDUC ESTSIZE
## 45-49 :12952 0 to 8 years : 2003 <20 :36269
## 50-54 :12380 Some high school : 9783 20-99 :35799
## 40-44 :11782 High school graduate :21594 100-500:20604
## 35-39 :11475 Some postsecondary : 7400 >500 :12816
## 30-34 :11045 Postsecondary certificate or diploma:39814
## 25-29 :10937 Bachelor's degree :17378
## (Other):34917 Above bachelor's degree : 7516
## FIRMSIZE AGYOWNK
## <20 :20662 >6 :13510
## 20-99 :17463 6-12 :11724
## 100-500:15467 13-17 : 8270
## >500 :51896 18-24 : 6840
## >24|NC:65144
##
##
# Hourly wage against age of youngest child, one plot per gender/year subset.
# Only the first panel (Males 2009) draws the "Hourly Wage" side label; the
# x-axis title is left disabled.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09male$AGYOWNK, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19male$AGYOWNK, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.09fem$AGYOWNK, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
# Must read the 2019 female subset; plotting data.all.09fem here would just
# repeat the Females 2009 panel in a different colour.
par(mar = c(7.5, 5.5, 1.0, 18.0))
plot(data.all.19fem$AGYOWNK, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Age of Youngest Child", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# HW v. PROVINCE ###############################################################
# Observation counts per province (whole sample).
summary(data.all$PROV)
## NL PEI NS NB QC ON MB SK AB BC
## 3515 2871 5197 5311 18664 29816 9332 7513 11177 12092
# For each gender/year subset: reorder the provinces by median hourly wage,
# plot wage against the reordered factor, then print the per-province medians.
# The printed medians are indexed by PROV itself, so they appear in the
# original level order rather than in plotting order.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(4.5, 5.5, 1.0, 1.0))
prov.order <- reorder(data.all.09male$PROV, data.all.09male$HRLYEARN,
                      FUN = median)
plot(prov.order, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.09male, round(tapply(HRLYEARN, PROV, median), 2))
## NL PEI NS NB QC ON MB SK AB BC
## 18.95 15.80 18.48 17.50 19.98 21.54 18.75 21.00 24.23 23.44
# Males 2019
par(mar = c(4.5, 5.5, 1.0, 1.0))
prov.order <- reorder(data.all.19male$PROV, data.all.19male$HRLYEARN,
                      FUN = median)
plot(prov.order, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.19male, round(tapply(HRLYEARN, PROV, median), 2))
## NL PEI NS NB QC ON MB SK AB BC
## 25.00 20.00 22.12 21.00 24.90 26.00 24.04 27.00 31.77 28.50
# Females 2009
par(mar = c(4.5, 5.5, 1.0, 1.0))
prov.order <- reorder(data.all.09fem$PROV, data.all.09fem$HRLYEARN,
                      FUN = median)
plot(prov.order, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.09fem, round(tapply(HRLYEARN, PROV, median), 2))
## NL PEI NS NB QC ON MB SK AB BC
## 15.00 14.42 15.21 15.00 17.00 18.00 16.31 17.62 18.75 18.20
# Females 2019
par(mar = c(4.5, 5.5, 1.0, 1.0))
prov.order <- reorder(data.all.19fem$PROV, data.all.19fem$HRLYEARN,
                      FUN = median)
plot(prov.order, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Province", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.19fem, round(tapply(HRLYEARN, PROV, median), 2))
## NL PEI NS NB QC ON MB SK AB BC
## 20.34 20.00 19.00 19.98 22.00 23.08 20.60 23.00 24.00 23.00
# HW v. MARITAL STATUS #########################################################
# Observation counts per marital status (whole sample).
summary(data.all$MARSTAT)
## Married Common-law Widowed Separated Divorced Single, NM
## 50075 16182 1175 2986 4835 30235
# For each gender/year subset: reorder the statuses by median hourly wage,
# plot wage against the reordered factor, then print the per-status medians
# (indexed by MARSTAT, i.e. in the original level order).
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(10, 5.5, 1.0, 1.0))
ms.order <- reorder(data.all.09male$MARSTAT, data.all.09male$HRLYEARN,
                    FUN = median)
plot(ms.order, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.09male, round(tapply(HRLYEARN, MARSTAT, median), 2))
## Married Common-law Widowed Separated Divorced Single, NM
## 24.00 21.63 20.00 22.00 22.00 15.00
# Males 2019
par(mar = c(10, 5.5, 1.0, 1.0))
ms.order <- reorder(data.all.19male$MARSTAT, data.all.19male$HRLYEARN,
                    FUN = median)
plot(ms.order, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.19male, round(tapply(HRLYEARN, MARSTAT, median), 2))
## Married Common-law Widowed Separated Divorced Single, NM
## 30.00 27.20 23.17 28.00 27.50 19.69
# Females 2009
par(mar = c(10, 5.5, 1.0, 1.0))
ms.order <- reorder(data.all.09fem$MARSTAT, data.all.09fem$HRLYEARN,
                    FUN = median)
plot(ms.order, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.09fem, round(tapply(HRLYEARN, MARSTAT, median), 2))
## Married Common-law Widowed Separated Divorced Single, NM
## 19.23 17.00 16.45 18.00 19.23 12.51
# Females 2019
par(mar = c(10, 5.5, 1.0, 1.0))
ms.order <- reorder(data.all.19fem$MARSTAT, data.all.19fem$HRLYEARN,
                    FUN = median)
plot(ms.order, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Marital Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
with(data.all.19fem, round(tapply(HRLYEARN, MARSTAT, median), 2))
## Married Common-law Widowed Separated Divorced Single, NM
## 24.92 23.00 20.00 24.00 22.67 17.95
# HW v. FULL TIME OR PART TIME #################################################
# Observation counts per full-time/part-time status (whole sample).
summary(data.all$FTPTMAIN)
## Full-time Part-time
## 86157 19331
# For each gender/year subset: reorder the two statuses by median hourly wage
# and plot wage against the reordered factor. The "Hourly Wage" side label is
# drawn on the Males 2009 and Females 2019 panels only; x-axis titles are off.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
ftpt.order <- reorder(data.all.09male$FTPTMAIN, data.all.09male$HRLYEARN,
                      FUN = median)
plot(ftpt.order, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
ftpt.order <- reorder(data.all.19male$FTPTMAIN, data.all.19male$HRLYEARN,
                      FUN = median)
plot(ftpt.order, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
ftpt.order <- reorder(data.all.09fem$FTPTMAIN, data.all.09fem$HRLYEARN,
                      FUN = median)
plot(ftpt.order, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
ftpt.order <- reorder(data.all.19fem$FTPTMAIN, data.all.19fem$HRLYEARN,
                      FUN = median)
plot(ftpt.order, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Full Time or Part Time Status", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Medians ALL GROUPS
# Median hourly wage per status, printed in the order Males 2009, Males 2019,
# Females 2009, Females 2019.
with(data.all.09male, round(tapply(HRLYEARN, FTPTMAIN, median), 2))
## Full-time Part-time
## 22.0 10.5
with(data.all.19male, round(tapply(HRLYEARN, FTPTMAIN, median), 2))
## Full-time Part-time
## 27.46 15.00
with(data.all.09fem, round(tapply(HRLYEARN, FTPTMAIN, median), 2))
## Full-time Part-time
## 19 12
with(data.all.19fem, round(tapply(HRLYEARN, FTPTMAIN, median), 2))
## Full-time Part-time
## 24.06 16.00
# HW v. JOB PERMANENCY #################################################
# Observation counts per job-permanency category (whole sample). This section
# analyses PERMTEMP, so the summary must report PERMTEMP rather than FTPTMAIN.
summary(data.all$PERMTEMP)
## (output not regenerated after correcting the variable; re-run to refresh)
# For each gender/year subset: reorder the permanency categories by median
# hourly wage and plot wage against the reordered factor. Only the first panel
# (Males 2009) draws the "Hourly Wage" side label; x-axis titles are off.
# NOTE(review): Males 2009 uses a bottom margin of 12 while the other three
# panels use 7.5 -- confirm which value is intended for the rotated labels.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(12, 5.5, 1.0, 18.0))
per.order <- with(data.all.09male, reorder(PERMTEMP, HRLYEARN, median))
plot(per.order, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
per.order <- with(data.all.19male, reorder(PERMTEMP, HRLYEARN, median))
plot(per.order, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
par(mar = c(7.5, 5.5, 1.0, 18.0))
per.order <- with(data.all.09fem, reorder(PERMTEMP, HRLYEARN, median))
plot(per.order, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
par(mar = c(7.5, 5.5, 1.0, 18.0))
per.order <- with(data.all.19fem, reorder(PERMTEMP, HRLYEARN, median))
plot(per.order, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Job Permanency", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Medians ALL GROUPS
# Median hourly wage per permanency category, printed in the order
# Males 2009, Males 2019, Females 2009, Females 2019.
round(tapply(data.all.09male$HRLYEARN, INDEX=data.all.09male$PERMTEMP, FUN=median), 2)
## Permanent Temp. season Temp. contract Temp. casual
## 21.63 15.00 19.23 12.00
round(tapply(data.all.19male$HRLYEARN, INDEX=data.all.19male$PERMTEMP, FUN=median), 2)
## Permanent Temp. season Temp. contract Temp. casual
## 26.92 19.50 24.93 15.54
round(tapply(data.all.09fem$HRLYEARN, INDEX=data.all.09fem$PERMTEMP, FUN=median), 2)
## Permanent Temp. season Temp. contract Temp. casual
## 17.85 11.00 17.21 12.50
round(tapply(data.all.19fem$HRLYEARN, INDEX=data.all.19fem$PERMTEMP, FUN=median), 2)
## Permanent Temp. season Temp. contract Temp. casual
## 23.00 15.00 22.44 16.25
# HW v. ECONOMIC FAMILY ########################################################
# Observation counts per economic-family type (whole sample).
summary(data.all$EFAMTYPE)
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 16842 22136 28911 7892 4053 4009 986 3423 1355 712 749
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 109 181 4991 2024 267 189 6659
# For each gender/year subset: reorder the family types by median hourly wage
# and plot wage against the reordered factor. The margins are set once, before
# the first panel, and stay in effect for the other three.
# NOTE(review): Males 2009 uses cex.axis = 1.3 while the other panels use 1.8
# -- confirm whether the smaller tick labels were meant for all four.
# Default par(mar=c(5.1, 4.1, 4.1, 2.1))
# Males 2009
par(mar = c(6, 5.5, 1.0, 1.0))
efa.order <- reorder(data.all.09male$EFAMTYPE, data.all.09male$HRLYEARN,
                     FUN = median)
plot(efa.order, data.all.09male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.3, las = 2,
     col = "slategray2", xlab = "", ylab = "")
# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
mtext("Hourly Wage", side = 2, line = 3.6, cex = 1.8)
# Males 2019
efa.order <- reorder(data.all.19male$EFAMTYPE, data.all.19male$HRLYEARN,
                     FUN = median)
plot(efa.order, data.all.19male$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "royalblue", xlab = "", ylab = "")
# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2009
efa.order <- reorder(data.all.09fem$EFAMTYPE, data.all.09fem$HRLYEARN,
                     FUN = median)
plot(efa.order, data.all.09fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "moccasin", xlab = "", ylab = "")
# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Females 2019
efa.order <- reorder(data.all.19fem$EFAMTYPE, data.all.19fem$HRLYEARN,
                     FUN = median)
plot(efa.order, data.all.19fem$HRLYEARN,
     ylim = c(0, 115), cex.lab = 1.8, cex.axis = 1.8, las = 2,
     col = "darkorange", xlab = "", ylab = "")
# mtext("Type of Economic Family", side=1, line=7.5, cex =1.8)
# mtext("Hourly Wage", side=2, line=3.6, cex =1.8)
# Medians ALL GROUPS
# Median hourly wage per family type, printed in the order Males 2009,
# Males 2019, Females 2009, Females 2019.
with(data.all.09male, round(tapply(HRLYEARN, EFAMTYPE, median), 2))
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 20.00 22.00 23.00 18.86 20.48 21.25 19.56 18.00 10.00 12.00 16.86
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 10.00 12.00 17.52 14.96 10.00 12.00 18.00
with(data.all.19male, round(tapply(HRLYEARN, EFAMTYPE, median), 2))
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 24.75 27.00 29.74 24.00 25.26 28.85 25.00 21.50 13.85 17.00 21.37
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 14.50 19.00 24.00 18.50 15.00 15.07 21.00
with(data.all.09fem, round(tapply(HRLYEARN, EFAMTYPE, median), 2))
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 17.80 18.40 18.00 16.00 14.75 9.60 10.50 17.00 16.00 15.71 16.00
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 9.50 10.00 16.05 15.68 9.38 10.00 16.00
with(data.all.19fem, round(tapply(HRLYEARN, EFAMTYPE, median), 2))
## Ind HWDENC HWDE17 HWDE24 HWSHNC HWSH17 HWSH24 HWSWNC HWSW17 HWSW24 HWNENC
## 22.20 23.00 24.50 21.00 20.00 14.13 15.00 21.00 20.00 21.20 21.24
## HWNE17 HWNE24 SPE17 SPE24 SPN17 SPN24 Other
## 13.82 15.00 20.00 19.00 14.36 15.50 19.50
# Highest hourly wages by year and gender, among sectors, industries, and occupations
# RESULTS BY GROUP #############################################################
# INDUSTRY #####################################################################
# Males 2009
# Industries are sorted by median hourly wage before plotting. The value
# returned by boxplot() is kept so its per-group statistics, sample sizes and
# outlier assignments can be printed below.
par(mar = c(15, 5, 2, 1))
ind.order <- reorder(data.all.09male$NAICS_18short, data.all.09male$HRLYEARN,
                     FUN = median)
wage.ind.09male <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.09male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "slategray2",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2009 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# $stats holds, per group, the lower whisker end, lower hinge, median, upper
# hinge and upper whisker end; label rows and columns in a single step.
dimnames(wage.ind.09male$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.09male$stats # Boxplot Summary
## AcFood Agri Rtail Mngt Other Info ManuN Whole Trans Health
## Lower Fence 5.00 2.14 3.79 3.33 3.13 3.380 5.45 4.25 3.210 3.300
## Q1 9.35 10.00 10.00 11.00 13.70 13.000 15.00 15.00 16.000 16.465
## Median 11.00 13.55 14.00 14.00 18.90 19.815 20.00 20.00 20.510 21.450
## Q3 15.00 17.31 20.14 20.00 25.00 28.850 28.00 27.47 25.985 31.000
## Upper Fence 23.08 28.00 35.20 33.00 41.83 51.920 47.16 46.15 40.870 52.200
## Const ManuD Finan Fores Educa ProSc PubAd Utils
## Lower Fence 5.10 3.48 3.75 4.44 3.610 5.00 3.13 5.13
## Q1 17.00 17.00 15.34 19.78 19.700 20.19 22.03 23.50
## Median 22.00 22.00 22.00 28.00 28.280 28.29 30.00 30.64
## Q3 29.87 29.51 33.65 34.62 37.555 38.74 37.91 38.00
## Upper Fence 49.15 48.08 60.00 55.77 64.100 65.93 61.54 58.24
wage.ind.09male$n # Sample size
## [1] 1429 497 2994 992 929 1106 1776 1258 1860 1067 3209 2605 971 1330 1460
## [16] 1086 1880 493
# $group records which box each outlier belongs to, so tabulating it counts
# the outliers per industry (columns follow the plotting order).
table(wage.ind.09male$group) # Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 80 14 136 64 33 23 60 49 96 29 41 76 30 49 19 38 39 10
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.09male) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 6107.8, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.09male, method = "bonferroni") # Post Hoc
# Males 2019
# Industries are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(15, 5, 2, 1))
ind.order <- reorder(data.all.19male$NAICS_18short, data.all.19male$HRLYEARN,
                     FUN = median)
wage.ind.19male <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.19male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "royalblue",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2019 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label the five statistics (rows) and the industries (columns) in one step.
dimnames(wage.ind.19male$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.19male$stats # Boxplot Summary
## AcFood Rtail Agri Mngt Info Other ManuN Trans ManuD Whole
## Lower Fence 6.92 3.040 3.30 4.56 4.62 3.000 3.53 5.00 5.29 9.62
## Q1 13.15 14.000 15.00 15.00 16.00 18.000 18.50 19.35 20.00 20.00
## Median 15.00 16.750 18.47 18.50 23.05 23.080 25.00 25.00 26.00 26.00
## Q3 17.61 24.855 23.08 25.00 34.07 30.965 34.00 32.00 35.00 36.00
## Upper Fence 24.18 41.080 35.00 40.00 60.22 50.000 56.54 50.96 57.50 58.50
## Health Const Finan ProSc Educa Fores PubAd Utils
## Lower Fence 4.81 8.170 3.53 5.77 3.25 3.21 5.13 14.000
## Q1 20.00 22.000 21.00 25.00 24.22 27.78 28.00 35.000
## Median 26.25 28.745 29.77 35.00 36.06 37.00 37.00 43.475
## Q3 37.90 37.000 42.31 46.67 48.08 49.04 47.00 52.000
## Upper Fence 63.37 58.000 72.92 78.85 82.05 80.77 75.00 76.920
wage.ind.19male$n # Sample size
## [1] 1336 2792 457 955 981 819 1489 1872 2207 1108 1205 3264 1014 1326 1306
## [16] 1242 1746 380
# Outliers counted per box, in plotting order.
table(wage.ind.19male$group) # Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 118 155 33 53 21 36 61 71 66 40 29 60 39 37 19 17 18 7
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.19male) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 6604, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.19male, method = "bonferroni") # Post Hoc
# Females 2009
# Industries are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(15, 5, 2, 1))
ind.order <- reorder(data.all.09fem$NAICS_18short, data.all.09fem$HRLYEARN,
                     FUN = median)
wage.ind.09fem <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.09fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "moccasin",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2009 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label the five statistics (rows) and the industries (columns) in one step.
dimnames(wage.ind.09fem$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.09fem$stats # Boxplot Summary
## AcFood Rtail Agri Mngt ManuN Other Info Whole Const ManuD Trans
## Lower Fence 4.77 3.66 4.81 5.03 5.77 2.00 3.55 5.13 3.50 5.490 3.55
## Q1 9.00 9.50 9.70 10.30 11.50 10.53 11.00 13.00 13.85 14.420 13.50
## Median 10.00 10.95 11.00 13.00 15.00 15.00 16.00 16.83 16.92 18.000 18.00
## Q3 13.00 14.50 14.00 17.00 19.35 19.56 23.00 21.63 20.77 24.105 23.12
## Upper Fence 19.00 22.00 20.00 27.00 31.00 32.97 40.87 34.00 30.29 38.460 37.33
## Finan ProSc Health Utils Educa PubAd Fores
## Lower Fence 5.22 3.08 3.50 3.90 3.07 5.49 5.00
## Q1 15.00 15.00 15.87 19.00 18.00 20.00 18.46
## Median 19.23 20.00 20.00 24.16 24.62 25.00 25.25
## Q3 25.00 28.21 29.07 29.78 33.65 31.87 33.65
## Upper Fence 40.00 47.99 48.73 44.00 57.05 49.60 55.27
wage.ind.09fem$n # Sample size
## [1] 2518 4155 221 821 1007 1127 1079 511 381 640 677 1823 1074 6213 140
## [16] 3114 1878 236
# Outliers counted per box, in plotting order.
table(wage.ind.09fem$group) # Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 178 301 14 54 53 68 30 15 25 31 17 98 37 62 5 47 59 9
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.09fem) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 9226.4, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.09fem, method = "bonferroni") # Post Hoc
# Females 2019
# Industries are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(15, 5, 2, 1))
ind.order <- reorder(data.all.19fem$NAICS_18short, data.all.19fem$HRLYEARN,
                     FUN = median)
wage.ind.19fem <- boxplot(
  HRLYEARN ~ ind.order,
  data = data.all.19fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "darkorange",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2019 - Hourly Wage by Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label the five statistics (rows) and the industries (columns) in one step.
dimnames(wage.ind.19fem$stats) <- list(
  c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence"),
  levels(ind.order)
)
wage.ind.19fem$stats # Boxplot Summary
## AcFood Rtail Agri Mngt Other ManuN Info Trans Whole ManuD
## Lower Fence 7.41 8.25 5.770 5.26 4.81 10.50 5.00 3.070 10.30 6.92
## Q1 13.00 13.50 14.000 14.77 15.00 15.34 15.00 17.465 17.50 18.03
## Median 14.50 15.00 16.000 18.00 19.17 19.75 20.00 21.720 22.09 22.50
## Q3 16.86 19.00 20.875 23.67 27.40 26.00 28.35 27.295 28.85 29.81
## Upper Fence 22.50 27.25 30.000 36.54 45.05 41.03 48.21 42.000 45.67 46.77
## Const Health Finan ProSc Educa PubAd Fores Utils
## Lower Fence 6.07 3.50 4.360 3.13 3.48 4.810 4.730 16.07
## Q1 18.00 19.00 19.975 20.00 23.00 25.295 23.875 27.85
## Median 23.00 24.04 25.640 26.44 30.00 31.370 34.000 37.00
## Q3 28.90 35.00 34.055 35.49 42.31 41.325 45.095 46.15
## Upper Fence 45.00 59.00 54.950 58.00 71.15 65.000 72.120 72.82
wage.ind.19fem$n # Sample size
## [1] 2204 3333 252 666 897 819 877 687 482 529 482 6227 1492 1168 3159
## [16] 1804 239 115
# Outliers counted per box, in plotting order.
table(wage.ind.19fem$group) # Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 184 272 17 26 35 57 32 44 23 24 26 60 72 40 43 46 1 2
kruskal.test(HRLYEARN ~ NAICS_18short, data = data.all.19fem) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NAICS_18short
## Kruskal-Wallis chi-squared = 7815.6, df = 17, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NAICS_18short, data = data.all.19fem, method = "bonferroni") # Post Hoc
# RESULTS BY GROUP #############################################################
# OCCUPATION ###################################################################
# Males 2009
# Occupations are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(16, 5, 2, 1))
occu.order.male09 <- reorder(data.all.09male$NOC_10short,
                             data.all.09male$HRLYEARN, FUN = median)
wage.occu.09male <- boxplot(
  HRLYEARN ~ occu.order.male09,
  data = data.all.09male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "slategray2",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2009 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label the five statistics (rows) and the occupations (columns) in one step.
dimnames(wage.occu.09male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.male09)
)
wage.occu.09male$stats
## Sales NatAgri ManUtil BusFin ArtCul Trades Health NatASc EduLaw Mngt
## LowFence 3.13 2.14 6.67 3.13 4.210 3.210 7.00 3.79 3.610 3.48
## Q1 10.00 12.00 15.00 15.00 15.000 16.000 18.50 21.63 20.975 23.08
## Median 14.00 17.00 19.50 20.19 20.295 20.675 25.00 28.85 30.000 33.00
## Q3 20.00 25.18 25.00 28.69 28.850 27.300 34.59 38.41 38.460 45.64
## UpFence 35.00 44.41 40.00 49.04 48.210 44.230 57.69 63.57 64.100 79.47
wage.occu.09male$n #Sample size
## [1] 5883 1315 2025 2434 376 8232 525 2632 1479 2041
# Outliers counted per box, in plotting order.
table(wage.occu.09male$group) #Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10
## 355 41 46 60 5 108 14 60 34 18
# NOTE(review): the test groups by NOC_10 while the plot groups by
# NOC_10short -- presumably the same categories with shorter labels; confirm.
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.09male) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 5740.1, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.09male, method = "bonferroni") # Post Hoc
# Males 2019
# Occupations are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(16, 5, 2, 1))
occu.order.male19 <- reorder(data.all.19male$NOC_10short,
                             data.all.19male$HRLYEARN, FUN = median)
wage.occu.19male <- boxplot(
  HRLYEARN ~ occu.order.male19,
  data = data.all.19male,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "royalblue",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males 2019 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label the five statistics (rows) and the occupations (columns) in one step.
dimnames(wage.occu.19male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.male19)
)
wage.occu.19male$stats
## Sales NatAgri ManUtil ArtCul Trades BusFin Health NatASc EduLaw Mngt
## LowFence 3.04 3.21 6.92 9.85 3.460 4.62 10.00 5.13 3.00 3.30
## Q1 14.00 17.00 18.00 17.50 20.000 20.19 21.00 26.62 26.25 32.88
## Median 16.50 23.00 23.36 23.50 26.000 27.00 30.00 36.06 37.50 45.00
## Q3 23.00 33.00 32.00 31.25 34.625 37.50 40.49 46.63 48.08 57.69
## UpFence 36.13 57.00 53.00 51.00 56.500 63.46 68.68 76.51 79.91 94.87
wage.occu.19male$n #Sample size
## [1] 5441 1243 1940 368 7492 2127 622 2751 1802 1713
# Outliers counted per box, in plotting order.
table(wage.occu.19male$group) #Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10
## 266 51 59 8 95 57 10 61 32 31
# NOTE(review): the test groups by NOC_10 while the plot groups by
# NOC_10short -- presumably the same categories with shorter labels; confirm.
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.19male) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 7220.2, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.19male, method = "bonferroni") # Post Hoc
# Females 2009
# Occupations are sorted by median hourly wage before plotting; the boxplot()
# return value is kept for the summaries printed below.
par(mar = c(16, 5, 2, 1))
occu.order.fem09 <- reorder(data.all.09fem$NOC_10short,
                            data.all.09fem$HRLYEARN, FUN = median)
wage.occu.09fem <- boxplot(
  HRLYEARN ~ occu.order.fem09,
  data = data.all.09fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "moccasin",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2009 - Hourly Wage by Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label the five statistics (rows) and the occupations (columns) in one step.
dimnames(wage.occu.09fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.fem09)
)
wage.occu.09fem$stats
## Sales NatAgri ManUtil Trades ArtCul BusFin Health NatASc EduLaw Mngt
## LoFence 2.000 4.81 3.500 4.63 3.55 3.45 4.17 5.000 3.070 4.160
## Q1 9.500 10.00 10.970 11.50 13.00 14.35 17.55 19.220 18.495 18.000
## Median 11.000 12.00 13.685 15.00 18.00 18.50 23.90 25.640 26.000 26.555
## Q3 15.365 16.50 17.150 20.00 26.39 23.08 33.65 33.685 33.750 38.460
## UpFence 24.130 25.96 26.000 32.09 46.15 36.06 57.69 55.000 56.410 68.680
wage.occu.09fem$n #Sample size
## [1] 8695 266 800 534 576 7388 3398 748 3664 1546
# Outliers counted per box, in plotting order.
table(wage.occu.09fem$group) #Total Outliers
##
## 1 2 3 4 5 6 7 8 9 10
## 330 14 37 14 12 244 14 20 50 22
# NOTE(review): the test groups by NOC_10 while the plot groups by
# NOC_10short -- presumably the same categories with shorter labels; confirm.
kruskal.test(HRLYEARN ~ NOC_10, data = data.all.09fem) # Diff. among groups
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 9838.5, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.09fem, method = "bonferroni") # Post Hoc
# Females 2019
# Occupation levels re-ordered by median hourly wage (ascending), so the
# boxes run from the lowest-paid to the highest-paid occupation.
occu.order.fem19 <- with(
  data.all.19fem,
  reorder(NOC_10short, HRLYEARN, median)
)
par(mar = c(16, 5, 2, 1))
wage.occu.19fem <- boxplot(
  HRLYEARN ~ occu.order.fem19,
  data = data.all.19fem,
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = "darkorange",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females 2019 - Hourly Wage by Occupation"
)
mtext(text = "Occupation (NOC_10)", side = 1, line = 4.3)
# Label the five-number matrix: rows are the box statistics, columns are the
# occupation levels in plotted order.
dimnames(wage.occu.19fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  levels(occu.order.fem19)
)
wage.occu.19fem$stats
## Sales NatAgri ManUtil Trades ArtCul BusFin Health EduLaw NatASc Mngt
## LoFence 5.00 5.26 10.50 4.55 5.42 3.07 5.05 3.48 6.25 3.30
## Q1 13.75 14.50 15.00 16.54 16.00 19.49 21.00 20.00 24.52 26.92
## Median 15.00 17.00 17.50 20.00 21.00 24.34 27.79 27.88 33.64 38.46
## Q3 19.79 23.97 21.45 26.00 27.89 30.53 40.00 39.90 43.00 52.88
## UpFence 28.72 38.00 31.00 40.15 45.64 47.00 67.31 69.71 70.62 91.35
# Number of observations behind each box, in plotted order
wage.occu.19fem[["n"]]
## [1] 7341 329 649 563 493 5958 3455 4569 832 1243
# Outlier counts per box (keys are box positions in plotted order)
table(wage.occu.19fem[["group"]])
##
## 1 2 3 4 5 6 7 8 9 10
## 441 27 56 29 9 251 12 71 15 10
# Kruskal-Wallis rank sum test: does HRLYEARN differ among NOC_10 groups?
kruskal.test(HRLYEARN ~ NOC_10,
             data = data.all.19fem)
##
## Kruskal-Wallis rank sum test
##
## data: HRLYEARN by NOC_10
## Kruskal-Wallis chi-squared = 8336.7, df = 9, p-value < 2.2e-16
# dunnTest(HRLYEARN ~ NOC_10, data = data.all.19fem, method = "bonferroni") # Post Hoc
# 2009 v. 2019 ################################################################
# SECTOR ######################################################################
# Mann Whitney U Test
# Rank-sum test of HRLYEARN between the two COWMAIN groups in
# data.all.09male; conf.int = TRUE adds the location-shift estimate and CI.
wilcox.test(HRLYEARN ~ COWMAIN, data = data.all.09male,
            alternative = "two.sided", conf.int = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by COWMAIN
## W = 78664610, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 6.700064 7.270075
## sample estimates:
## difference in location
## 6.999958
# Rank-sum test of HRLYEARN between the two COWMAIN groups in
# data.all.19male; conf.int = TRUE adds the location-shift estimate and CI.
wilcox.test(HRLYEARN ~ COWMAIN, data = data.all.19male,
            alternative = "two.sided", conf.int = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by COWMAIN
## W = 69385905, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 8.039925 8.999984
## sample estimates:
## difference in location
## 8.500087
# Rank-sum test of HRLYEARN between the two COWMAIN groups in
# data.all.09fem; conf.int = TRUE adds the location-shift estimate and CI.
wilcox.test(HRLYEARN ~ COWMAIN, data = data.all.09fem,
            alternative = "two.sided", conf.int = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by COWMAIN
## W = 133145579, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 8.959938 9.269923
## sample estimates:
## difference in location
## 9.039943
# Rank-sum test of HRLYEARN between the two COWMAIN groups in
# data.all.19fem; conf.int = TRUE adds the location-shift estimate and CI.
wilcox.test(HRLYEARN ~ COWMAIN, data = data.all.19fem,
            alternative = "two.sided", conf.int = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by COWMAIN
## W = 112929303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 9.710015 10.109941
## sample estimates:
## difference in location
## 9.999988
# MALES
# Boxplot Analysis
# Single plotting panel with a wide bottom margin.
par(mfrow = c(1, 1))
par(mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Male"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.14 16.00 23.00 26.19 33.00 115.38
# Sector levels ordered by median male hourly wage (ascending).
sec.order.male <- with(
  data.all[data.all$SEX == "Male", ],
  reorder(COWMAIN, HRLYEARN, median)
)
# SURVYEAR varies fastest, so each sector gets an adjacent pair of boxes
# (one per survey year), coloured by year.
wage.sec.male <- boxplot(
  HRLYEARN ~ SURVYEAR * sec.order.male,
  data = data.all[data.all$SEX == "Male", ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 1,
  cex.axis = 1.0,
  col = c("slategray2", "royalblue"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males Hourly Wage by Year and Sector"
)
# Label of X Axis: one sector label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 4, 2),
  labels = levels(sec.order.male),
  tick = FALSE,
  cex = 0.3,
  las = 1
)
# Grey Vertical Lines between sector pairs; abline() accepts a vector for v,
# so no loop is needed.
abline(v = seq(0.5, 6, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Males 2009", "Males 2019"),
  col = c("slategray2", "royalblue"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Boxplot Stats: name rows by statistic and columns by year.sector group
rownames(wage.sec.male$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.sec.male$stats) <- wage.sec.male$names
wage.sec.male$stats # Boxplot Summary
## 2009.Private sector 2019.Private sector 2009.Public sector
## Lower Fence 2.140 3.00 3.13
## Q1 13.615 17.31 20.00
## Median 19.290 24.04 27.00
## Q3 27.640 35.00 36.00
## Upper Fence 48.580 61.49 60.00
## 2019.Public sector
## Lower Fence 3.25
## Q1 25.00
## Median 34.00
## Q3 45.00
## Upper Fence 75.00
# Number of observations behind each box, in plotted order
wage.sec.male[["n"]]
## [1] 21615 20581 5327 4918
# Outlier counts per box (keys are box positions in plotted order)
table(wage.sec.male[["group"]])
##
## 1 2 3 4
## 653 662 105 67
# FEMALES
summary(data.all$HRLYEARN[data.all$SEX == "Female"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 14.00 19.72 22.58 28.05 106.67
# Sector levels ordered by median female hourly wage (ascending).
sec.order.fem <- with(
  data.all[data.all$SEX == "Female", ],
  reorder(COWMAIN, HRLYEARN, median)
)
# SURVYEAR varies fastest, so each sector gets an adjacent pair of boxes
# (one per survey year), coloured by year.
# The 2019 fill is "darkorange" so the boxes match the legend swatch below
# and the other female 2019 plots in this file (was "darkorange2").
wage.sec.fem <- boxplot(
  HRLYEARN ~ SURVYEAR * sec.order.fem,
  data = data.all[data.all$SEX == "Female", ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 1,
  cex.axis = 1.0,
  col = c("moccasin", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females Hourly Wage by Year and Sector"
)
# Label of X Axis: one sector label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 4, 2),
  labels = levels(sec.order.fem),
  tick = FALSE,
  cex = 0.3,
  las = 1
)
# Grey Vertical Lines between sector pairs; abline() accepts a vector for v,
# so no loop is needed.
abline(v = seq(0.5, 6, 2), lty = 1, col = "grey")
# Add a legend (colours must match the boxplot fills above)
legend(
  "topleft",
  legend = c("Females 2009", "Females 2019"),
  col = c("moccasin", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Boxplot Stats: name rows by statistic and columns by year.sector group
rownames(wage.sec.fem$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.sec.fem$stats) <- wage.sec.fem$names
wage.sec.fem$stats # Boxplot Summary
## 2009.Private sector 2019.Private sector 2009.Public sector
## Lower Fence 2.00 3.13 3.070
## Q1 10.00 15.00 18.500
## Median 14.07 18.75 24.000
## Q3 20.00 25.82 32.695
## Upper Fence 35.00 42.05 53.850
## 2019.Public sector
## Lower Fence 3.07
## Q1 23.00
## Median 30.00
## Q3 40.88
## Upper Fence 67.31
# Number of observations behind each box, in plotted order
wage.sec.fem[["n"]]
## [1] 18508 16650 9107 8782
# Outlier counts per box (keys are box positions in plotted order)
table(wage.sec.fem[["group"]])
##
## 1 2 3 4
## 839 979 121 117
# 2009 v. 2019 ################################################################
# INDUSTRY ####################################################################
# MALES
# Mann Whitney U Test
# Rank-sum test of HRLYEARN between survey years, males in "Utils".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided", conf.int = TRUE
) # Top1
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 44871, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -14.07003 -10.89997
## sample estimates:
## difference in location
## -12.52998
# Rank-sum test of HRLYEARN between survey years, males in "PubAd".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided", conf.int = TRUE
) # Top2
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 1116584, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -8.049964 -6.410015
## sample estimates:
## difference in location
## -7.230063
# Rank-sum test of HRLYEARN between survey years, males in "Fores".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided", conf.int = TRUE
) # Top3
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 502548, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -10.849993 -8.589955
## sample estimates:
## difference in location
## -9.739952
# Rank-sum test of HRLYEARN between survey years, males in "Educa".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided", conf.int = TRUE
) # Top4
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 677554, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -8.579975 -6.310040
## sample estimates:
## difference in location
## -7.450054
# Rank-sum test of HRLYEARN between survey years, males in "ProSc".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided", conf.int = TRUE
) # Top5
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 554060, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -6.899933 -4.619922
## sample estimates:
## difference in location
## -5.719948
# Boxplot Analysis
# Single plotting panel with a wide bottom margin for the rotated labels.
par(mfrow = c(1, 1))
par(mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Male"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.14 16.00 23.00 26.19 33.00 115.38
# Industry levels ordered by median male hourly wage (ascending).
ind.order.male <- with(
  data.all[data.all$SEX == "Male", ],
  reorder(NAICS_18short, HRLYEARN, median)
)
# SURVYEAR varies fastest, so each industry gets an adjacent pair of boxes
# (one per survey year), coloured by year.
wage.ind.male <- boxplot(
  HRLYEARN ~ SURVYEAR * ind.order.male,
  data = data.all[data.all$SEX == "Male", ],
  boxwex = 0.4,
  ylim = c(0, 120),
  cex.axis = 1.0,
  col = c("slategray2", "royalblue"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males Hourly Wage by Year and Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label of X Axis: one industry label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 36, 2),
  labels = levels(ind.order.male),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines between industry pairs; abline() accepts a vector for
# v, so no loop is needed.
abline(v = seq(0.5, 40, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Males 2009", "Males 2019"),
  col = c("slategray2", "royalblue"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Boxplot Stats: name rows by statistic and columns by year.industry group
rownames(wage.ind.male$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.ind.male$stats) <- wage.ind.male$names
wage.ind.male$stats # Boxplot Summary
## 2009.AcFood 2019.AcFood 2009.Rtail 2019.Rtail 2009.Agri 2019.Agri
## Lower Fence 5.00 6.92 3.79 3.040 2.14 3.30
## Q1 9.35 13.15 10.00 14.000 10.00 15.00
## Median 11.00 15.00 14.00 16.750 13.55 18.47
## Q3 15.00 17.61 20.14 24.855 17.31 23.08
## Upper Fence 23.08 24.18 35.20 41.080 28.00 35.00
## 2009.Mngt 2019.Mngt 2009.Other 2019.Other 2009.Info 2019.Info
## Lower Fence 3.33 4.56 3.13 3.000 3.380 4.62
## Q1 11.00 15.00 13.70 18.000 13.000 16.00
## Median 14.00 18.50 18.90 23.080 19.815 23.05
## Q3 20.00 25.00 25.00 30.965 28.850 34.07
## Upper Fence 33.00 40.00 41.83 50.000 51.920 60.22
## 2009.ManuN 2019.ManuN 2009.Trans 2019.Trans 2009.Whole 2019.Whole
## Lower Fence 5.45 3.53 3.210 5.00 4.25 9.62
## Q1 15.00 18.50 16.000 19.35 15.00 20.00
## Median 20.00 25.00 20.510 25.00 20.00 26.00
## Q3 28.00 34.00 25.985 32.00 27.47 36.00
## Upper Fence 47.16 56.54 40.870 50.96 46.15 58.50
## 2009.ManuD 2019.ManuD 2009.Health 2019.Health 2009.Const 2019.Const
## Lower Fence 3.48 5.29 3.300 4.81 5.10 8.170
## Q1 17.00 20.00 16.465 20.00 17.00 22.000
## Median 22.00 26.00 21.450 26.25 22.00 28.745
## Q3 29.51 35.00 31.000 37.90 29.87 37.000
## Upper Fence 48.08 57.50 52.200 63.37 49.15 58.000
## 2009.Finan 2019.Finan 2009.Fores 2019.Fores 2009.ProSc 2019.ProSc
## Lower Fence 3.75 3.53 4.44 3.21 5.00 5.77
## Q1 15.34 21.00 19.78 27.78 20.19 25.00
## Median 22.00 29.77 28.00 37.00 28.29 35.00
## Q3 33.65 42.31 34.62 49.04 38.74 46.67
## Upper Fence 60.00 72.92 55.77 80.77 65.93 78.85
## 2009.Educa 2019.Educa 2009.PubAd 2019.PubAd 2009.Utils 2019.Utils
## Lower Fence 3.610 3.25 3.13 5.13 5.13 14.000
## Q1 19.700 24.22 22.03 28.00 23.50 35.000
## Median 28.280 36.06 30.00 37.00 30.64 43.475
## Q3 37.555 48.08 37.91 47.00 38.00 52.000
## Upper Fence 64.100 82.05 61.54 75.00 58.24 76.920
# Number of observations behind each box, in plotted order
wage.ind.male[["n"]]
## [1] 1429 1336 2994 2792 497 457 992 955 929 819 1106 981 1776 1489 1860
## [16] 1872 1258 1108 2605 2207 1067 1205 3209 3264 971 1014 1330 1242 1086 1326
## [31] 1460 1306 1880 1746 493 380
# Outlier counts per box (keys are box positions in plotted order)
table(wage.ind.male[["group"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 80 118 136 155 14 33 64 53 33 36 23 21 60 61 96 71 49 40 76 66
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 29 29 41 60 30 39 49 17 38 37 19 19 39 18 10 7
# FEMALES
# Mann Whitney U Test
# Rank-sum test of HRLYEARN between survey years, females in "Utils".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided", conf.int = TRUE
) # Top1
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 3277.5, p-value = 3.859e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -14.999966 -9.439992
## sample estimates:
## difference in location
## -12.13
# Rank-sum test of HRLYEARN between survey years, females in "Fores".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided", conf.int = TRUE
) # Top3
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 18717, p-value = 2.28e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -10.649967 -5.650075
## sample estimates:
## difference in location
## -8.030013
# Rank-sum test of HRLYEARN between survey years, females in "PubAd".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided", conf.int = TRUE
) # Top2
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 1070692, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -7.280006 -5.999961
## sample estimates:
## difference in location
## -6.669972
# Rank-sum test of HRLYEARN between survey years, females in "Educa".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided", conf.int = TRUE
) # Top4
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 3546959, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -6.269986 -5.099947
## sample estimates:
## difference in location
## -5.749971
# Rank-sum test of HRLYEARN between survey years, females in "ProSc".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided", conf.int = TRUE
) # Top5
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 404999, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -6.970001 -5.270070
## sample estimates:
## difference in location
## -6.040043
# Boxplot Analysis
# Single plotting panel with a wide bottom margin for the rotated labels.
par(mfrow = c(1, 1))
par(mar = c(16, 5, 2, 1))
summary(data.all$HRLYEARN[data.all$SEX == "Female"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 14.00 19.72 22.58 28.05 106.67
# Industry levels ordered by median female hourly wage (ascending).
ind.order.fem <- with(
  data.all[data.all$SEX == "Female", ],
  reorder(NAICS_18short, HRLYEARN, median)
)
# SURVYEAR varies fastest, so each industry gets an adjacent pair of boxes
# (one per survey year), coloured by year.
wage.ind.fem <- boxplot(
  HRLYEARN ~ SURVYEAR * ind.order.fem,
  data = data.all[data.all$SEX == "Female", ],
  boxwex = 0.4,
  ylim = c(0, 120),
  cex.axis = 1.0,
  col = c("moccasin", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females Hourly Wage by Year and Industry"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label of X Axis: one industry label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 36, 2),
  labels = levels(ind.order.fem),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines between industry pairs; abline() accepts a vector for
# v, so no loop is needed.
abline(v = seq(0.5, 40, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Females 2009", "Females 2019"),
  col = c("moccasin", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Boxplot Stats: name rows by statistic and columns by year.industry group
rownames(wage.ind.fem$stats) <- c("Lower Fence", "Q1", "Median", "Q3", "Upper Fence")
colnames(wage.ind.fem$stats) <- wage.ind.fem$names
wage.ind.fem$stats # Boxplot Summary
## 2009.AcFood 2019.AcFood 2009.Rtail 2019.Rtail 2009.Agri 2019.Agri
## Lower Fence 4.77 7.41 3.66 8.25 4.81 5.770
## Q1 9.00 13.00 9.50 13.50 9.70 14.000
## Median 10.00 14.50 10.95 15.00 11.00 16.000
## Q3 13.00 16.86 14.50 19.00 14.00 20.875
## Upper Fence 19.00 22.50 22.00 27.25 20.00 30.000
## 2009.Mngt 2019.Mngt 2009.Other 2019.Other 2009.ManuN 2019.ManuN
## Lower Fence 5.03 5.26 2.00 4.81 5.77 10.50
## Q1 10.30 14.77 10.53 15.00 11.50 15.34
## Median 13.00 18.00 15.00 19.17 15.00 19.75
## Q3 17.00 23.67 19.56 27.40 19.35 26.00
## Upper Fence 27.00 36.54 32.97 45.05 31.00 41.03
## 2009.Info 2019.Info 2009.Whole 2019.Whole 2009.Const 2019.Const
## Lower Fence 3.55 5.00 5.13 10.30 3.50 6.07
## Q1 11.00 15.00 13.00 17.50 13.85 18.00
## Median 16.00 20.00 16.83 22.09 16.92 23.00
## Q3 23.00 28.35 21.63 28.85 20.77 28.90
## Upper Fence 40.87 48.21 34.00 45.67 30.29 45.00
## 2009.ManuD 2019.ManuD 2009.Trans 2019.Trans 2009.Finan 2019.Finan
## Lower Fence 5.490 6.92 3.55 3.070 5.22 4.360
## Q1 14.420 18.03 13.50 17.465 15.00 19.975
## Median 18.000 22.50 18.00 21.720 19.23 25.640
## Q3 24.105 29.81 23.12 27.295 25.00 34.055
## Upper Fence 38.460 46.77 37.33 42.000 40.00 54.950
## 2009.Health 2019.Health 2009.ProSc 2019.ProSc 2009.Educa 2019.Educa
## Lower Fence 3.50 3.50 3.08 3.13 3.07 3.48
## Q1 15.87 19.00 15.00 20.00 18.00 23.00
## Median 20.00 24.04 20.00 26.44 24.62 30.00
## Q3 29.07 35.00 28.21 35.49 33.65 42.31
## Upper Fence 48.73 59.00 47.99 58.00 57.05 71.15
## 2009.PubAd 2019.PubAd 2009.Utils 2019.Utils 2009.Fores 2019.Fores
## Lower Fence 5.49 4.810 3.90 16.07 5.00 4.730
## Q1 20.00 25.295 19.00 27.85 18.46 23.875
## Median 25.00 31.370 24.16 37.00 25.25 34.000
## Q3 31.87 41.325 29.78 46.15 33.65 45.095
## Upper Fence 49.60 65.000 44.00 72.82 55.27 72.120
# Number of observations behind each box, in plotted order
wage.ind.fem[["n"]]
## [1] 2518 2204 4155 3333 221 252 821 666 1127 897 1007 819 1079 877 511
## [16] 482 381 482 640 529 677 687 1823 1492 6213 6227 1074 1168 3114 3159
## [31] 1878 1804 140 115 236 239
# Outlier counts per box (keys are box positions in plotted order)
table(wage.ind.fem[["group"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 178 184 301 272 14 17 54 26 68 35 53 57 30 32 15 23 25 26 31 24
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 17 44 98 72 62 60 37 40 47 43 59 46 5 2 9 1
# 2009 v. 2019 ################################################################
# OCCUPATION ##################################################################
# MALES
# Mann Whitney U Test
# Rank-sum test of HRLYEARN between survey years, males in "Mngt".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided", conf.int = TRUE
) # Top1
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 1120265, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -12.25999 -10.00005
## sample estimates:
## difference in location
## -11.17997
# Rank-sum test of HRLYEARN between survey years, males in "EduLaw".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided", conf.int = TRUE
) # Top2
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 966244, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -8.059976 -6.049982
## sample estimates:
## difference in location
## -7.05005
# Rank-sum test of HRLYEARN between survey years, males in "NatASc".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Male" &
                    data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided", conf.int = TRUE
) # Top3
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 2564647, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -7.489955 -6.020037
## sample estimates:
## difference in location
## -6.770048
# Boxplot Analysis
# Occupation levels ordered by median male hourly wage (ascending).
occu.order.male <- with(
  data.all[data.all$SEX == "Male", ],
  reorder(NOC_10short, HRLYEARN, median)
)
par(mar = c(16, 5, 2, 1))
# SURVYEAR varies fastest, so each occupation gets an adjacent pair of boxes
# (one per survey year), coloured by year.
wage.occu.male <- boxplot(
  HRLYEARN ~ SURVYEAR * occu.order.male,
  data = data.all[data.all$SEX == "Male", ],
  boxwex = 0.4,
  las = 2,
  cex.axis = 1.0,
  col = c("slategray2", "royalblue"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Males Hourly Wage by Year and Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label of X Axis: one occupation label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 20, 2),
  labels = levels(occu.order.male),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines between occupation pairs; abline() accepts a vector
# for v, so no loop is needed.
abline(v = seq(0.5, 20, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Males 2009", "Males 2019"),
  col = c("slategray2", "royalblue"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Raw (unlabelled) five-number matrix, one column per box
wage.occu.male$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 3.13 3.04 2.14 3.21 6.67 6.92 4.210 9.85 3.210 3.460 3.13 4.62
## [2,] 10.00 14.00 12.00 17.00 15.00 18.00 15.000 17.50 16.000 20.000 15.00 20.19
## [3,] 14.00 16.50 17.00 23.00 19.50 23.36 20.295 23.50 20.675 26.000 20.19 27.00
## [4,] 20.00 23.00 25.18 33.00 25.00 32.00 28.850 31.25 27.300 34.625 28.69 37.50
## [5,] 35.00 36.13 44.41 57.00 40.00 53.00 48.210 51.00 44.230 56.500 49.04 63.46
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 7.00 10.00 3.79 5.13 3.610 3.00 3.48 3.30
## [2,] 18.50 21.00 21.63 26.62 20.975 26.25 23.08 32.88
## [3,] 25.00 30.00 28.85 36.06 30.000 37.50 33.00 45.00
## [4,] 34.59 40.49 38.41 46.63 38.460 48.08 45.64 57.69
## [5,] 57.69 68.68 63.57 76.51 64.100 79.91 79.47 94.87
# Boxplot Stats
# Label the five-number matrix: rows are the box statistics, columns are the
# year.occupation groups reported by boxplot().
dimnames(wage.occu.male$stats) <- list(
  c("LowFence", "Q1", "Median", "Q3", "UpFence"),
  wage.occu.male$names
)
wage.occu.male$stats # Boxplot Summary
## 2009.Sales 2019.Sales 2009.NatAgri 2019.NatAgri 2009.ManUtil
## LowFence 3.13 3.04 2.14 3.21 6.67
## Q1 10.00 14.00 12.00 17.00 15.00
## Median 14.00 16.50 17.00 23.00 19.50
## Q3 20.00 23.00 25.18 33.00 25.00
## UpFence 35.00 36.13 44.41 57.00 40.00
## 2019.ManUtil 2009.ArtCul 2019.ArtCul 2009.Trades 2019.Trades
## LowFence 6.92 4.210 9.85 3.210 3.460
## Q1 18.00 15.000 17.50 16.000 20.000
## Median 23.36 20.295 23.50 20.675 26.000
## Q3 32.00 28.850 31.25 27.300 34.625
## UpFence 53.00 48.210 51.00 44.230 56.500
## 2009.BusFin 2019.BusFin 2009.Health 2019.Health 2009.NatASc
## LowFence 3.13 4.62 7.00 10.00 3.79
## Q1 15.00 20.19 18.50 21.00 21.63
## Median 20.19 27.00 25.00 30.00 28.85
## Q3 28.69 37.50 34.59 40.49 38.41
## UpFence 49.04 63.46 57.69 68.68 63.57
## 2019.NatASc 2009.EduLaw 2019.EduLaw 2009.Mngt 2019.Mngt
## LowFence 5.13 3.610 3.00 3.48 3.30
## Q1 26.62 20.975 26.25 23.08 32.88
## Median 36.06 30.000 37.50 33.00 45.00
## Q3 46.63 38.460 48.08 45.64 57.69
## UpFence 76.51 64.100 79.91 79.47 94.87
# Number of observations behind each box, in plotted order
wage.occu.male[["n"]]
## [1] 5883 5441 1315 1243 2025 1940 376 368 8232 7492 2434 2127 525 622 2632
## [16] 2751 1479 1802 2041 1713
# Outlier counts per box (keys are box positions in plotted order)
table(wage.occu.male[["group"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 355 266 41 51 46 59 5 8 108 95 60 57 14 10 60 61 34 32 18 31
# FEMALES
# Mann Whitney U Test
# Rank-sum test of HRLYEARN between survey years, females in "Mngt".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided", conf.int = TRUE
) # Top1
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 593358, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -12.019993 -9.619946
## sample estimates:
## difference in location
## -10.81004
# Rank-sum test of HRLYEARN between survey years, females in "NatASc".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided", conf.int = TRUE
) # Top2
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 208380, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -8.200015 -5.870033
## sample estimates:
## difference in location
## -7.000035
# Rank-sum test of HRLYEARN between survey years, females in "EduLaw".
wilcox.test(
  HRLYEARN ~ SURVYEAR,
  data = data.all[data.all$SEX == "Female" &
                    data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided", conf.int = TRUE
) # Top3
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SURVYEAR
## W = 7142727, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -3.560001 -2.509974
## sample estimates:
## difference in location
## -3.000021
# Boxplot Analysis
# Occupation levels ordered by median female hourly wage (ascending).
occu.order.fem <- with(
  data.all[data.all$SEX == "Female", ],
  reorder(NOC_10short, HRLYEARN, median)
)
par(mar = c(16, 5, 2, 1))
# SURVYEAR varies fastest, so each occupation gets an adjacent pair of boxes
# (one per survey year), coloured by year.
wage.occu.fem <- boxplot(
  HRLYEARN ~ SURVYEAR * occu.order.fem,
  data = data.all[data.all$SEX == "Female", ],
  boxwex = 0.4,
  las = 2,
  cex.axis = 1.0,
  col = c("moccasin", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Females Hourly Wage by Year and Occupation"
)
mtext("Occupation (NOC_10)", side = 1, line = 4.3)
# Label of X Axis: one occupation label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 20, 2),
  labels = levels(occu.order.fem),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines between occupation pairs; abline() accepts a vector
# for v, so no loop is needed.
abline(v = seq(0.5, 20, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Females 2009", "Females 2019"),
  col = c("moccasin", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Raw (unlabelled) five-number matrix, one column per box
wage.occu.fem$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 2.000 5.00 4.81 5.26 3.500 10.50 4.63 4.55 3.55 5.42 3.45 3.07
## [2,] 9.500 13.75 10.00 14.50 10.970 15.00 11.50 16.54 13.00 16.00 14.35 19.49
## [3,] 11.000 15.00 12.00 17.00 13.685 17.50 15.00 20.00 18.00 21.00 18.50 24.34
## [4,] 15.365 19.79 16.50 23.97 17.150 21.45 20.00 26.00 26.39 27.89 23.08 30.53
## [5,] 24.130 28.72 25.96 38.00 26.000 31.00 32.09 40.15 46.15 45.64 36.06 47.00
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 4.17 5.05 3.070 3.48 5.000 6.25 4.160 3.30
## [2,] 17.55 21.00 18.495 20.00 19.220 24.52 18.000 26.92
## [3,] 23.90 27.79 26.000 27.88 25.640 33.64 26.555 38.46
## [4,] 33.65 40.00 33.750 39.90 33.685 43.00 38.460 52.88
## [5,] 57.69 67.31 56.410 69.71 55.000 70.62 68.680 91.35
# Boxplot Stats
# Label the five-number matrix: rows are the box statistics, columns are the
# year.occupation groups reported by boxplot().
dimnames(wage.occu.fem$stats) <- list(
  c("LoFence", "Q1", "Median", "Q3", "UpFence"),
  wage.occu.fem$names
)
wage.occu.fem$stats # Boxplot Summary
## 2009.Sales 2019.Sales 2009.NatAgri 2019.NatAgri 2009.ManUtil
## LoFence 2.000 5.00 4.81 5.26 3.500
## Q1 9.500 13.75 10.00 14.50 10.970
## Median 11.000 15.00 12.00 17.00 13.685
## Q3 15.365 19.79 16.50 23.97 17.150
## UpFence 24.130 28.72 25.96 38.00 26.000
## 2019.ManUtil 2009.Trades 2019.Trades 2009.ArtCul 2019.ArtCul
## LoFence 10.50 4.63 4.55 3.55 5.42
## Q1 15.00 11.50 16.54 13.00 16.00
## Median 17.50 15.00 20.00 18.00 21.00
## Q3 21.45 20.00 26.00 26.39 27.89
## UpFence 31.00 32.09 40.15 46.15 45.64
## 2009.BusFin 2019.BusFin 2009.Health 2019.Health 2009.EduLaw 2019.EduLaw
## LoFence 3.45 3.07 4.17 5.05 3.070 3.48
## Q1 14.35 19.49 17.55 21.00 18.495 20.00
## Median 18.50 24.34 23.90 27.79 26.000 27.88
## Q3 23.08 30.53 33.65 40.00 33.750 39.90
## UpFence 36.06 47.00 57.69 67.31 56.410 69.71
## 2009.NatASc 2019.NatASc 2009.Mngt 2019.Mngt
## LoFence 5.000 6.25 4.160 3.30
## Q1 19.220 24.52 18.000 26.92
## Median 25.640 33.64 26.555 38.46
## Q3 33.685 43.00 38.460 52.88
## UpFence 55.000 70.62 68.680 91.35
# Number of observations behind each box, in plotted order
wage.occu.fem[["n"]]
## [1] 8695 7341 266 329 800 649 534 563 576 493 7388 5958 3398 3455 3664
## [16] 4569 748 832 1546 1243
# Outlier counts per box (keys are box positions in plotted order)
table(wage.occu.fem[["group"]])
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 330 441 14 27 37 56 14 29 12 9 244 251 14 12 50 71 20 15 22 10
# Males v. Females #############################################################
# SECTOR #######################################################################
# Mann Whitney U Tests
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Private sector".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$COWMAIN == "Private sector", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 218257664, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 4.000026 4.499960
## sample estimates:
## difference in location
## 4.220031
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Public sector".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$COWMAIN == "Public sector", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 24650974, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.729960 3.609973
## sample estimates:
## difference in location
## 3.090046
# Boxplot Analysis
# Sector levels ordered by median 2019 hourly wage (ascending).
sec.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(COWMAIN, HRLYEARN, median)
)
par(mar = c(16, 5, 2, 1))
# SEX varies fastest, so each sector gets an adjacent pair of boxes (one per
# SEX level), coloured in the same order as the legend below.
wage.sec.19 <- boxplot(
  HRLYEARN ~ SEX * sec.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Sector - 2019 Gender Comparison"
)
# Label of X Axis: one sector label centred under each pair of boxes.
# NOTE(review): axis label size is normally controlled by cex.axis; confirm
# whether cex = 0.3 has any effect here.
axis(
  1,
  at = seq(1.5, 4, 2),
  labels = levels(sec.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 1
)
# Grey Vertical Lines between sector pairs; abline() accepts a vector for v,
# so no loop is needed.
abline(v = seq(0.5, 6, 2), lty = 1, col = "grey")
# Add a legend
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Raw (unlabelled) five-number matrix, one column per box
wage.sec.19$stats
## [,1] [,2] [,3] [,4]
## [1,] 3.00 3.13 3.25 3.07
## [2,] 17.31 15.00 25.00 23.00
## [3,] 24.04 18.75 34.00 30.00
## [4,] 35.00 25.82 45.00 40.88
## [5,] 61.49 42.05 75.00 67.31
# Males v. Females #############################################################
# INDUSTRY #####################################################################
# Mann Whitney U Tests
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Agri" industry.
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Agri", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 68024, p-value = 6.254e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.9999976 2.5000029
## sample estimates:
## difference in location
## 1.750036
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Fores" industry.
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Fores", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 166042, p-value = 0.003606
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.000054 5.290070
## sample estimates:
## difference in location
## 3.100048
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Utils" industry.
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Utils", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 27312, p-value = 4.826e-05
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.089948 8.900052
## sample estimates:
## difference in location
## 6.000049
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "Const" industry.
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "Const", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1020349, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.999967 5.829932
## sample estimates:
## difference in location
## 4.999997
# Rank-sum test of HRLYEARN between the SEX groups, 2019 "ManuD" industry.
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 &
                    data.all$NAICS_18short == "ManuD", ],
  alternative = "two.sided", conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 699118, p-value = 1.544e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.100034 3.950029
## sample estimates:
## difference in location
## 3.000056
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "ManuN".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuN", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 790826, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.550011 5.000051
## sample estimates:
## difference in location
## 4.24996
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Whole".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Whole", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 327178, p-value = 8.746e-13
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.500003 4.500087
## sample estimates:
## difference in location
## 3.499906
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Rtail".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 5449884, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.000037 1.500014
## sample estimates:
## difference in location
## 1.25008
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Trans".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Trans", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 756647, p-value = 6.886e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.929985 3.349944
## sample estimates:
## difference in location
## 2.600061
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Finan".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 879092, p-value = 5.238e-12
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.469944 4.499994
## sample estimates:
## difference in location
## 3.479976
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "ProSc".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ProSc", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1019833, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 6.169975 8.279945
## sample estimates:
## difference in location
## 7.210066
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Mngt".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 346817, p-value = 0.001885
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.250000 1.499922
## sample estimates:
## difference in location
## 0.9999967
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Educa".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 2383919, p-value = 2.516e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.000094 4.999943
## sample estimates:
## difference in location
## 4.000059
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Health".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 4094719, p-value = 4.884e-07
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.9999984 2.1500867
## sample estimates:
## difference in location
## 1.599926
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Info".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Info", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 500721, p-value = 9.825e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.770008 3.499980
## sample estimates:
## difference in location
## 2.549954
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "AcFood".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1550054, p-value = 0.008176
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 4.414496e-05 4.999908e-01
## sample estimates:
## difference in location
## 0.14998
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "Other".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 456494, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.819933 4.359998
## sample estimates:
## difference in location
## 3.500032
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NAICS_18short == "PubAd".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1873268, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.380016 5.049965
## sample estimates:
## difference in location
## 4.219966
# Boxplot Analysis
# Order the industries by the median 2019 hourly wage so that the boxes are
# laid out along the x axis in that order.
ind.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(NAICS_18short, HRLYEARN, median)
)
# Enlarged bottom margin: the industry labels are drawn perpendicular to the
# axis (las = 2) and need the room.
par(mar = c(16, 5, 2, 1))
# One box per SEX level within every industry; the two fill colours are
# recycled across the boxes. The default x axis is suppressed (xaxt = "n")
# and drawn by hand below. The return value is kept so the box statistics
# can be printed at the end.
wage.ind.19 <- boxplot(
  HRLYEARN ~ SEX * ind.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Industry - 2019 Gender Comparison"
)
mtext("Industry (NAICS_18)", side = 1, line = 4.3)
# Label of X Axis
# One label per industry, centred between its two boxes (1.5, 3.5, ...). The
# positions are derived from the number of industry levels so that `at` and
# `labels` always have the same length.
axis(
  1,
  at = seq(1.5, 2 * nlevels(ind.order.19), by = 2),
  labels = levels(ind.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines
# Separators between neighbouring industry pairs (0.5, 2.5, ...), up to the
# right edge of the last pair; abline() is vectorised over `v`.
abline(
  v = seq(0.5, 2 * nlevels(ind.order.19) + 0.5, by = 2),
  lty = 1,
  col = "grey"
)
# Add a legend
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Five-number summary (rows) for every box (columns), in plotting order.
wage.ind.19$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 6.92 7.41 3.040 8.25 3.30 5.770 4.56 5.26 3.000 4.81 4.62 5.00
## [2,] 13.15 13.00 14.000 13.50 15.00 14.000 15.00 14.77 18.000 15.00 16.00 15.00
## [3,] 15.00 14.50 16.750 15.00 18.47 16.000 18.50 18.00 23.080 19.17 23.05 20.00
## [4,] 17.61 16.86 24.855 19.00 23.08 20.875 25.00 23.67 30.965 27.40 34.07 28.35
## [5,] 24.18 22.50 41.080 27.25 35.00 30.000 40.00 36.54 50.000 45.05 60.22 48.21
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 3.53 10.50 5.00 3.070 4.81 3.50 5.29 6.92 9.62 10.30 3.53 4.360
## [2,] 18.50 15.34 19.35 17.465 20.00 19.00 20.00 18.03 20.00 17.50 21.00 19.975
## [3,] 25.00 19.75 25.00 21.720 26.25 24.04 26.00 22.50 26.00 22.09 29.77 25.640
## [4,] 34.00 26.00 32.00 27.295 37.90 35.00 35.00 29.81 36.00 28.85 42.31 34.055
## [5,] 56.54 41.03 50.96 42.000 63.37 59.00 57.50 46.77 58.50 45.67 72.92 54.950
## [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35]
## [1,] 8.170 6.07 5.77 3.13 3.25 3.48 5.13 4.810 3.21 4.730 14.000
## [2,] 22.000 18.00 25.00 20.00 24.22 23.00 28.00 25.295 27.78 23.875 35.000
## [3,] 28.745 23.00 35.00 26.44 36.06 30.00 37.00 31.370 37.00 34.000 43.475
## [4,] 37.000 28.90 46.67 35.49 48.08 42.31 47.00 41.325 49.04 45.095 52.000
## [5,] 58.000 45.00 78.85 58.00 82.05 71.15 75.00 65.000 80.77 72.120 76.920
## [,36]
## [1,] 16.07
## [2,] 27.85
## [3,] 37.00
## [4,] 46.15
## [5,] 72.82
# Males v. Females #############################################################
# OCCUPATION ###################################################################
# Mann Whitney U Tests
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "Mngt".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1268072, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 4.809938 7.350044
## sample estimates:
## difference in location
## 6.080045
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "NatASc".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1290381, p-value = 2.365e-08
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.950024 4.000029
## sample estimates:
## difference in location
## 2.999974
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "EduLaw".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 5310381, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 6.639996 8.200009
## sample estimates:
## difference in location
## 7.430066
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "Health".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 1140716, p-value = 0.01428
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.1500462 2.1200176
## sample estimates:
## difference in location
## 1.150062
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "Trades".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 2803271, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 4.080045 5.749970
## sample estimates:
## difference in location
## 4.999944
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "BusFin".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 7312384, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 2.130060 3.169963
## sample estimates:
## difference in location
## 2.709973
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "ArtCul".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 104442, p-value = 0.0001424
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 1.000031 3.500068
## sample estimates:
## difference in location
## 2.219955
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "ManUtil".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 903084, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 4.500042 5.949957
## sample estimates:
## difference in location
## 5.050023
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "NatAgri".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 269920, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 3.500010 5.710072
## sample estimates:
## difference in location
## 4.619989
# Two-sided Mann-Whitney U (Wilcoxon rank-sum) test of HRLYEARN between the
# SEX groups, restricted to 2019 records with NOC_10short == "Sales".
wilcox.test(
  HRLYEARN ~ SEX,
  data = data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales", ],
  alternative = "two.sided",
  conf.int = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: HRLYEARN by SEX
## W = 22827094, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 0.9999802 1.1500083
## sample estimates:
## difference in location
## 1.000046
# Boxplot Analysis
# Order the occupations by the median 2019 hourly wage so that the boxes are
# laid out along the x axis in that order.
occu.order.19 <- with(
  data.all[data.all$SURVYEAR == 2019, ],
  reorder(NOC_10short, HRLYEARN, median)
)
# Enlarged bottom margin: the occupation labels are drawn perpendicular to
# the axis (las = 2) and need the room.
par(mar = c(16, 5, 2, 1))
# One box per SEX level within every occupation; the two fill colours are
# recycled across the boxes. The default x axis is suppressed (xaxt = "n")
# and drawn by hand below. The return value is kept so the box statistics
# can be printed at the end.
wage.occu.19 <- boxplot(
  HRLYEARN ~ SEX * occu.order.19,
  data = data.all[data.all$SURVYEAR == 2019, ],
  boxwex = 0.4,
  ylim = c(0, 120),
  las = 2,
  cex.axis = 1.0,
  col = c("royalblue", "darkorange"),
  xaxt = "n",
  xlab = "",
  ylab = "Hourly Wages (HRLYEARN)",
  main = "Hourly Wage by Occupation - 2019 Gender Comparison"
)
# Label of X Axis
# One label per occupation, centred between its two boxes (1.5, 3.5, ...).
# The positions are derived from the number of occupation levels so that
# `at` and `labels` always have the same length.
axis(
  1,
  at = seq(1.5, 2 * nlevels(occu.order.19), by = 2),
  labels = levels(occu.order.19),
  tick = FALSE,
  cex = 0.3,
  las = 2
)
# Grey Vertical Lines
# Separators between neighbouring occupation pairs (0.5, 2.5, ...), including
# the line at the right edge of the last pair; abline() is vectorised over
# `v`.
abline(
  v = seq(0.5, 2 * nlevels(occu.order.19) + 0.5, by = 2),
  lty = 1,
  col = "grey"
)
# Add a legend
legend(
  "topleft",
  legend = c("Males 2019", "Females 2019"),
  col = c("royalblue", "darkorange"),
  pch = 15, bty = "n", pt.cex = 2, cex = 1.0, horiz = TRUE,
  inset = c(0.01, 0.01)
)
# Five-number summary (rows) for every box (columns), in plotting order.
wage.occu.19$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 3.04 5.00 3.21 5.26 6.92 10.50 9.85 5.42 4.62 3.07 3.460 4.55
## [2,] 14.00 13.75 17.00 14.50 18.00 15.00 17.50 16.00 20.19 19.49 20.000 16.54
## [3,] 16.50 15.00 23.00 17.00 23.36 17.50 23.50 21.00 27.00 24.34 26.000 20.00
## [4,] 23.00 19.79 33.00 23.97 32.00 21.45 31.25 27.89 37.50 30.53 34.625 26.00
## [5,] 36.13 28.72 57.00 38.00 53.00 31.00 51.00 45.64 63.46 47.00 56.500 40.15
## [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 10.00 5.05 3.00 3.48 5.13 6.25 3.30 3.30
## [2,] 21.00 21.00 26.25 20.00 26.62 24.52 32.88 26.92
## [3,] 30.00 27.79 37.50 27.88 36.06 33.64 45.00 38.46
## [4,] 40.49 40.00 48.08 39.90 46.63 43.00 57.69 52.88
## [5,] 68.68 67.31 79.91 69.71 76.51 70.62 94.87 91.35
# Normalize numeric variables
# Min-max rescaling of a numeric vector onto [0, 1].
#
# x:     numeric vector to rescale.
# na.rm: if TRUE, missing values are ignored when locating the minimum and
#        maximum and remain NA in the result. The default (FALSE) keeps the
#        previous behaviour: a single NA makes the whole result NA.
#
# Returns a numeric vector of the same length as x. A vector whose values are
# all identical has zero range; it is mapped to 0 rather than to the NaN that
# 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant input: every element equals the minimum.
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Apply normalize() to every column of data.all named in num.vars, collect
# the rescaled columns in a new data frame, and print its summary.
data.all.n <- as.data.frame(
  lapply(X = data.all[num.vars], FUN = normalize)
)
summary(object = data.all.n)
## UTOTHRS TENURE HRLYEARN
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3509 1st Qu.:0.05858 1st Qu.:0.1147
## Median :0.4016 Median :0.21757 Median :0.1676
## Mean :0.3634 Mean :0.35573 Mean :0.1973
## 3rd Qu.:0.4016 3rd Qu.:0.59414 3rd Qu.:0.2537
## Max. :1.0000 Max. :1.00000 Max. :1.0000
# Negated %in%: TRUE for elements of the left operand that do not occur in
# the right operand.
`%nin%` <- Negate(`%in%`)
# Rebuild the full data set: the columns of data.all that are not listed in
# num.vars come first, followed by the normalized versions of the num.vars
# columns (so those columns move to the end).
data.all.n <- cbind(data.all[names(data.all) %nin% num.vars], data.all.n)
# TRAINING (70%) AND TESTING (30%) SETS
# The normalized data are first partitioned by survey year and sex. Each
# partition is then split at random: 70% of its rows (rounded down) go to the
# training set and the remaining rows to the test set. Every partition uses
# its own fixed seed, and the sampled row positions are written to a CSV file.
# seq_len() is used instead of 1:nrow() so that an empty partition yields an
# empty index vector rather than c(1, 0).
data.all.n09male <- data.all.n[data.all.n$SURVYEAR == 2009 & data.all.n$SEX == "Male", ]
data.all.n09fem <- data.all.n[data.all.n$SURVYEAR == 2009 & data.all.n$SEX == "Female", ]
data.all.n19male <- data.all.n[data.all.n$SURVYEAR == 2019 & data.all.n$SEX == "Male", ]
data.all.n19fem <- data.all.n[data.all.n$SURVYEAR == 2019 & data.all.n$SEX == "Female", ]
# 2009 Males
set.seed(1)
idx.09male <- sample(
  seq_len(nrow(data.all.n09male)),
  floor(0.7 * nrow(data.all.n09male))
)
write.csv(idx.09male, file = "Train_Idx_09Male.csv", row.names = FALSE)
train.09male <- data.all.n09male[idx.09male, ]
test.09male <- data.all.n09male[-idx.09male, ]
# 2009 Females
set.seed(30)
idx.09fem <- sample(
  seq_len(nrow(data.all.n09fem)),
  floor(0.7 * nrow(data.all.n09fem))
)
write.csv(idx.09fem, file = "Train_Idx_09Fem.csv", row.names = FALSE)
train.09fem <- data.all.n09fem[idx.09fem, ]
test.09fem <- data.all.n09fem[-idx.09fem, ]
# 2019 Males
set.seed(500)
idx.19male <- sample(
  seq_len(nrow(data.all.n19male)),
  floor(0.7 * nrow(data.all.n19male))
)
write.csv(idx.19male, file = "Train_Idx_19Male.csv", row.names = FALSE)
train.19male <- data.all.n19male[idx.19male, ]
test.19male <- data.all.n19male[-idx.19male, ]
# 2019 Females
set.seed(7000)
idx.19fem <- sample(
  seq_len(nrow(data.all.n19fem)),
  floor(0.7 * nrow(data.all.n19fem))
)
write.csv(idx.19fem, file = "Train_Idx_19Fem.csv", row.names = FALSE)
train.19fem <- data.all.n19fem[idx.19fem, ]
test.19fem <- data.all.n19fem[-idx.19fem, ]
# Males 2009
# Model
# List the available columns before choosing the candidate predictors.
colnames(data.all)
## [1] "REC_NUM" "SURVYEAR" "SURVMNTH" "LFSSTAT"
## [5] "PROV" "CMA" "AGE_12" "SEX"
## [9] "MARSTAT" "EDUC" "MJH" "COWMAIN"
## [13] "IMMIG" "NAICS_18" "NOC_10" "NOC_40"
## [17] "FTPTMAIN" "UTOTHRS" "TENURE" "HRLYEARN"
## [21] "UNION" "PERMTEMP" "ESTSIZE" "FIRMSIZE"
## [25] "SCHOOLN" "EFAMTYPE" "AGYOWNK" "EDUCshort"
## [29] "NAICS_18short" "NOC_10short"
# Upper bound of the stepwise search: all candidate predictors.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.09male
)
# Lower bound and starting point of the search: intercept only.
null <- lm(HRLYEARN ~ 1, data = train.09male)
# Stepwise selection by AIC, adding and dropping terms ("both") between the
# two bounds. Set trace = TRUE to show the steps of adding and subtracting
# vars.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
##
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
## TENURE + ESTSIZE + FIRMSIZE + AGYOWNK + PERMTEMP + FTPTMAIN +
## UTOTHRS + UNION + MARSTAT + LFSSTAT + COWMAIN, data = train.09male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34761 -0.04505 -0.00701 0.03390 0.64416
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.2573064 0.0080378 32.012
## NOC_10Business, finance & administration -0.0980386 0.0028446 -34.464
## NOC_10Natural & applied sciences -0.0530643 0.0028425 -18.668
## NOC_10Health -0.0357390 0.0052941 -6.751
## NOC_10Educ., law, community & gov. services -0.0479927 0.0037356 -12.847
## NOC_10Art, culture, recreation & sport -0.0764742 0.0054402 -14.057
## NOC_10Sales & service -0.1004194 0.0025816 -38.898
## NOC_10Trades, transport & equipm. operators -0.0931347 0.0025773 -36.136
## NOC_10Natural resources & agriculture -0.0958967 0.0043085 -22.257
## NOC_10Manufacturing & utilities -0.1145789 0.0033682 -34.018
## NAICS_18Forestry, Fishing, Min., Oil & Gas 0.0545069 0.0052637 10.355
## NAICS_18Utilities 0.0634565 0.0071919 8.823
## NAICS_18Construction 0.0418910 0.0056413 7.426
## NAICS_18Manufacturing durables 0.0180903 0.0057355 3.154
## NAICS_18Manufacturing non-durables 0.0110890 0.0058769 1.887
## NAICS_18Wholesale Trade 0.0153245 0.0059535 2.574
## NAICS_18Retail Trade -0.0166003 0.0056865 -2.919
## NAICS_18Transportation & Warehousing 0.0059642 0.0058085 1.027
## NAICS_18Finance, Insurance, Real Est. & Leas. 0.0223155 0.0061734 3.615
## NAICS_18Prof., Scientific & Technical Services 0.0444630 0.0060855 7.306
## NAICS_18Management, Admin. & Support -0.0146452 0.0059094 -2.478
## NAICS_18Educational Services 0.0075440 0.0066166 1.140
## NAICS_18Health Care & Social Assistance -0.0170636 0.0066286 -2.574
## NAICS_18Information, Culture & Recreation 0.0037343 0.0060453 0.618
## NAICS_18Accommodation & Food Services -0.0288712 0.0060149 -4.800
## NAICS_18Other Services 0.0006623 0.0061136 0.108
## NAICS_18Public Administration 0.0468728 0.0062819 7.462
## AGE_12.L 0.0125046 0.0047295 2.644
## AGE_12.Q -0.0529037 0.0044881 -11.788
## AGE_12.C -0.0193159 0.0038943 -4.960
## AGE_12^4 -0.0058821 0.0035415 -1.661
## AGE_12^5 -0.0112318 0.0032448 -3.462
## AGE_12^6 0.0032615 0.0029043 1.123
## AGE_12^7 -0.0034502 0.0025315 -1.363
## AGE_12^8 0.0052577 0.0021961 2.394
## AGE_12^9 0.0028232 0.0019463 1.451
## AGE_12^10 0.0042481 0.0017737 2.395
## AGE_12^11 0.0004379 0.0016550 0.265
## PROVPEI -0.0195493 0.0047748 -4.094
## PROVNS -0.0116356 0.0040612 -2.865
## PROVNB -0.0099912 0.0039832 -2.508
## PROVQC 0.0069783 0.0034623 2.015
## PROVON 0.0214436 0.0033285 6.442
## PROVMB 0.0004315 0.0036292 0.119
## PROVSK 0.0250554 0.0037888 6.613
## PROVAB 0.0479803 0.0035522 13.507
## PROVBC 0.0316994 0.0035888 8.833
## EDUC.L 0.0632672 0.0028592 22.127
## EDUC.Q 0.0166378 0.0025775 6.455
## EDUC.C 0.0062549 0.0020482 3.054
## EDUC^4 -0.0063464 0.0018846 -3.368
## EDUC^5 -0.0038765 0.0013888 -2.791
## EDUC^6 -0.0024849 0.0015972 -1.556
## TENURE 0.0350170 0.0019904 17.593
## ESTSIZE.L 0.0208994 0.0017118 12.209
## ESTSIZE.Q 0.0046253 0.0014286 3.238
## ESTSIZE.C 0.0012085 0.0012716 0.950
## FIRMSIZE.L 0.0145086 0.0015441 9.396
## FIRMSIZE.Q -0.0002057 0.0014856 -0.138
## FIRMSIZE.C 0.0004759 0.0015209 0.313
## AGYOWNK.L -0.0076283 0.0017404 -4.383
## AGYOWNK.Q -0.0033694 0.0018326 -1.839
## AGYOWNK.C -0.0011279 0.0019759 -0.571
## AGYOWNK^4 -0.0065809 0.0020133 -3.269
## PERMTEMPTemp. season -0.0172820 0.0026052 -6.634
## PERMTEMPTemp. contract -0.0123026 0.0025365 -4.850
## PERMTEMPTemp. casual -0.0136647 0.0037091 -3.684
## FTPTMAINPart-time -0.0262977 0.0027109 -9.701
## UTOTHRS -0.0529196 0.0072706 -7.279
## UNIONNot member but covered 0.0067830 0.0039039 1.738
## UNIONNon-unionized -0.0066323 0.0015134 -4.382
## MARSTATCommon-law -0.0015794 0.0018519 -0.853
## MARSTATWidowed -0.0005856 0.0081516 -0.072
## MARSTATSeparated 0.0026465 0.0037717 0.702
## MARSTATDivorced -0.0027376 0.0031783 -0.861
## MARSTATSingle, NM -0.0096063 0.0019417 -4.947
## LFSSTATEmployed, absent from work -0.0062774 0.0024873 -2.524
## COWMAINPrivate sector 0.0055096 0.0027263 2.021
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health 1.51e-11 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction 1.17e-13 ***
## NAICS_18Manufacturing durables 0.001612 **
## NAICS_18Manufacturing non-durables 0.059191 .
## NAICS_18Wholesale Trade 0.010060 *
## NAICS_18Retail Trade 0.003513 **
## NAICS_18Transportation & Warehousing 0.304528
## NAICS_18Finance, Insurance, Real Est. & Leas. 0.000301 ***
## NAICS_18Prof., Scientific & Technical Services 2.85e-13 ***
## NAICS_18Management, Admin. & Support 0.013210 *
## NAICS_18Educational Services 0.254235
## NAICS_18Health Care & Social Assistance 0.010054 *
## NAICS_18Information, Culture & Recreation 0.536771
## NAICS_18Accommodation & Food Services 1.60e-06 ***
## NAICS_18Other Services 0.913735
## NAICS_18Public Administration 8.92e-14 ***
## AGE_12.L 0.008200 **
## AGE_12.Q < 2e-16 ***
## AGE_12.C 7.11e-07 ***
## AGE_12^4 0.096747 .
## AGE_12^5 0.000538 ***
## AGE_12^6 0.261464
## AGE_12^7 0.172931
## AGE_12^8 0.016670 *
## AGE_12^9 0.146917
## AGE_12^10 0.016627 *
## AGE_12^11 0.791338
## PROVPEI 4.25e-05 ***
## PROVNS 0.004174 **
## PROVNB 0.012139 *
## PROVQC 0.043867 *
## PROVON 1.20e-10 ***
## PROVMB 0.905348
## PROVSK 3.87e-11 ***
## PROVAB < 2e-16 ***
## PROVBC < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 1.11e-10 ***
## EDUC.C 0.002263 **
## EDUC^4 0.000760 ***
## EDUC^5 0.005256 **
## EDUC^6 0.119776
## TENURE < 2e-16 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.001208 **
## ESTSIZE.C 0.341907
## FIRMSIZE.L < 2e-16 ***
## FIRMSIZE.Q 0.889889
## FIRMSIZE.C 0.754376
## AGYOWNK.L 1.18e-05 ***
## AGYOWNK.Q 0.065994 .
## AGYOWNK.C 0.568126
## AGYOWNK^4 0.001082 **
## PERMTEMPTemp. season 3.37e-11 ***
## PERMTEMPTemp. contract 1.24e-06 ***
## PERMTEMPTemp. casual 0.000230 ***
## FTPTMAINPart-time < 2e-16 ***
## UTOTHRS 3.51e-13 ***
## UNIONNot member but covered 0.082314 .
## UNIONNon-unionized 1.18e-05 ***
## MARSTATCommon-law 0.393758
## MARSTATWidowed 0.942728
## MARSTATSeparated 0.482893
## MARSTATDivorced 0.389067
## MARSTATSingle, NM 7.58e-07 ***
## LFSSTATEmployed, absent from work 0.011620 *
## COWMAINPrivate sector 0.043304 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0771 on 18781 degrees of freedom
## Multiple R-squared: 0.4733, Adjusted R-squared: 0.4711
## F-statistic: 219.1 on 77 and 18781 DF, p-value: < 2.2e-16
# Investigating SCHOOLN Coefficient NA
str(data.all$SCHOOLN)
## Factor w/ 4 levels "Non-student",..: 1 1 1 1 1 1 1 1 1 1 ...
# Number of 2009 male training rows whose SCHOOLN is "Unknown". Summing the
# logical vector with na.rm = TRUE avoids counting rows with a missing
# SCHOOLN, which nrow(df[cond, ]) would include.
sum(train.09male$SCHOOLN == "Unknown", na.rm = TRUE)
## [1] 407
# Diagnostic Plots
# par(mfrow = c(2, 2))
# Standard-sized margins again (the boxplots above enlarged the bottom one).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(model, lwd = 6)
# Box Cox Transformation (Selected attributes from Stepwise Regression)
# Refit the stepwise-selected formula explicitly, then profile the Box-Cox
# log-likelihood over lambda in [-3, 3].
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + ESTSIZE + FIRMSIZE + AGYOWNK + PERMTEMP + FTPTMAIN +
    UTOTHRS + UNION + MARSTAT + LFSSTAT + COWMAIN,
  data = train.09male
)
bc <- boxcox(bc.model, lambda = seq(-3, 3))
# Lambda with the highest log-likelihood on the evaluated grid.
bc$x[which.max(bc$y)]
## [1] 0.2121212
# Refit with the response raised to the power 0.21, i.e. the Box-Cox lambda
# printed above (0.2121212) rounded to two decimals.
# NOTE(review): this predictor list differs from the stepwise-selected model
# and from bc.model: it contains MJH and omits COWMAIN. Confirm whether that
# is intentional or a copy-paste from another model.
new.model <- lm(formula = (HRLYEARN)^0.21 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP + FTPTMAIN +
UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH, data = train.09male)
summary(new.model)
##
## Call:
## lm(formula = (HRLYEARN)^0.21 ~ NOC_10 + NAICS_18 + AGE_12 + PROV +
## EDUC + TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP +
## FTPTMAIN + UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH, data = train.09male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39703 -0.03265 -0.00008 0.03233 0.24573
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.7177075 0.0055964 128.244
## NOC_10Business, finance & administration -0.0622680 0.0020346 -30.604
## NOC_10Natural & applied sciences -0.0302886 0.0020329 -14.899
## NOC_10Health -0.0205009 0.0037842 -5.417
## NOC_10Educ., law, community & gov. services -0.0301744 0.0026716 -11.294
## NOC_10Art, culture, recreation & sport -0.0436001 0.0038923 -11.202
## NOC_10Sales & service -0.0690683 0.0018462 -37.411
## NOC_10Trades, transport & equipm. operators -0.0571677 0.0018431 -31.018
## NOC_10Natural resources & agriculture -0.0583828 0.0030814 -18.947
## NOC_10Manufacturing & utilities -0.0756620 0.0024086 -31.414
## NAICS_18Forestry, Fishing, Min., Oil & Gas 0.0491871 0.0037648 13.065
## NAICS_18Utilities 0.0531426 0.0049322 10.775
## NAICS_18Construction 0.0464192 0.0040344 11.506
## NAICS_18Manufacturing durables 0.0289088 0.0041007 7.050
## NAICS_18Manufacturing non-durables 0.0211285 0.0042017 5.029
## NAICS_18Wholesale Trade 0.0257190 0.0042575 6.041
## NAICS_18Retail Trade -0.0022691 0.0040668 -0.558
## NAICS_18Transportation & Warehousing 0.0149749 0.0041431 3.614
## NAICS_18Finance, Insurance, Real Est. & Leas. 0.0267711 0.0044145 6.064
## NAICS_18Prof., Scientific & Technical Services 0.0455705 0.0043521 10.471
## NAICS_18Management, Admin. & Support -0.0026267 0.0042262 -0.622
## NAICS_18Educational Services 0.0177953 0.0044631 3.987
## NAICS_18Health Care & Social Assistance 0.0014326 0.0046158 0.310
## NAICS_18Information, Culture & Recreation 0.0137391 0.0043174 3.182
## NAICS_18Accommodation & Food Services -0.0137698 0.0043017 -3.201
## NAICS_18Other Services 0.0136806 0.0043723 3.129
## NAICS_18Public Administration 0.0438451 0.0041542 10.554
## AGE_12.L 0.0128548 0.0033823 3.801
## AGE_12.Q -0.0527933 0.0032098 -16.447
## AGE_12.C -0.0049210 0.0027853 -1.767
## AGE_12^4 -0.0109511 0.0025330 -4.323
## AGE_12^5 -0.0080144 0.0023207 -3.453
## AGE_12^6 0.0030622 0.0020772 1.474
## AGE_12^7 -0.0042250 0.0018107 -2.333
## AGE_12^8 0.0039378 0.0015706 2.507
## AGE_12^9 0.0011558 0.0013919 0.830
## AGE_12^10 0.0022889 0.0012685 1.804
## AGE_12^11 -0.0003042 0.0011836 -0.257
## PROVPEI -0.0152036 0.0034159 -4.451
## PROVNS -0.0074142 0.0029051 -2.552
## PROVNB -0.0069053 0.0028496 -2.423
## PROVQC 0.0077757 0.0024785 3.137
## PROVON 0.0177324 0.0023829 7.442
## PROVMB 0.0038445 0.0025985 1.479
## PROVSK 0.0227459 0.0027121 8.387
## PROVAB 0.0392619 0.0025429 15.440
## PROVBC 0.0271287 0.0025688 10.561
## EDUC.L 0.0415791 0.0020446 20.336
## EDUC.Q 0.0043983 0.0018435 2.386
## EDUC.C 0.0024609 0.0014646 1.680
## EDUC^4 -0.0054203 0.0013480 -4.021
## EDUC^5 -0.0026859 0.0009933 -2.704
## EDUC^6 -0.0005705 0.0011422 -0.499
## TENURE 0.0284930 0.0014218 20.041
## ESTSIZE.L 0.0150023 0.0012232 12.265
## ESTSIZE.Q 0.0025721 0.0010208 2.520
## ESTSIZE.C 0.0009307 0.0009092 1.024
## AGYOWNK.L -0.0054457 0.0012447 -4.375
## AGYOWNK.Q -0.0020668 0.0013108 -1.577
## AGYOWNK.C -0.0007248 0.0014131 -0.513
## AGYOWNK^4 -0.0035055 0.0014398 -2.435
## FIRMSIZE.L 0.0099756 0.0011033 9.042
## FIRMSIZE.Q -0.0006254 0.0010624 -0.589
## FIRMSIZE.C 0.0003912 0.0010877 0.360
## PERMTEMPTemp. season -0.0160308 0.0018627 -8.606
## PERMTEMPTemp. contract -0.0097256 0.0018138 -5.362
## PERMTEMPTemp. casual -0.0131244 0.0026522 -4.949
## FTPTMAINPart-time -0.0257426 0.0020261 -12.706
## UTOTHRS -0.0320650 0.0056232 -5.702
## MARSTATCommon-law -0.0009476 0.0013244 -0.716
## MARSTATWidowed -0.0006996 0.0058299 -0.120
## MARSTATSeparated -0.0003618 0.0026973 -0.134
## MARSTATDivorced -0.0020224 0.0022730 -0.890
## MARSTATSingle, NM -0.0081825 0.0013886 -5.892
## UNIONNot member but covered 0.0003877 0.0027905 0.139
## UNIONNon-unionized -0.0107784 0.0010635 -10.135
## LFSSTATEmployed, absent from work -0.0034503 0.0017794 -1.939
## MJHMultiple jobholder -0.0029147 0.0022009 -1.324
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health 6.12e-08 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction < 2e-16 ***
## NAICS_18Manufacturing durables 1.85e-12 ***
## NAICS_18Manufacturing non-durables 4.99e-07 ***
## NAICS_18Wholesale Trade 1.56e-09 ***
## NAICS_18Retail Trade 0.576869
## NAICS_18Transportation & Warehousing 0.000302 ***
## NAICS_18Finance, Insurance, Real Est. & Leas. 1.35e-09 ***
## NAICS_18Prof., Scientific & Technical Services < 2e-16 ***
## NAICS_18Management, Admin. & Support 0.534263
## NAICS_18Educational Services 6.71e-05 ***
## NAICS_18Health Care & Social Assistance 0.756280
## NAICS_18Information, Culture & Recreation 0.001464 **
## NAICS_18Accommodation & Food Services 0.001372 **
## NAICS_18Other Services 0.001757 **
## NAICS_18Public Administration < 2e-16 ***
## AGE_12.L 0.000145 ***
## AGE_12.Q < 2e-16 ***
## AGE_12.C 0.077281 .
## AGE_12^4 1.54e-05 ***
## AGE_12^5 0.000555 ***
## AGE_12^6 0.140438
## AGE_12^7 0.019641 *
## AGE_12^8 0.012176 *
## AGE_12^9 0.406325
## AGE_12^10 0.071173 .
## AGE_12^11 0.797203
## PROVPEI 8.60e-06 ***
## PROVNS 0.010716 *
## PROVNB 0.015390 *
## PROVQC 0.001708 **
## PROVON 1.04e-13 ***
## PROVMB 0.139026
## PROVSK < 2e-16 ***
## PROVAB < 2e-16 ***
## PROVBC < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 0.017048 *
## EDUC.C 0.092921 .
## EDUC^4 5.82e-05 ***
## EDUC^5 0.006856 **
## EDUC^6 0.617447
## TENURE < 2e-16 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.011754 *
## ESTSIZE.C 0.306046
## AGYOWNK.L 1.22e-05 ***
## AGYOWNK.Q 0.114870
## AGYOWNK.C 0.608015
## AGYOWNK^4 0.014913 *
## FIRMSIZE.L < 2e-16 ***
## FIRMSIZE.Q 0.556072
## FIRMSIZE.C 0.719102
## PERMTEMPTemp. season < 2e-16 ***
## PERMTEMPTemp. contract 8.32e-08 ***
## PERMTEMPTemp. casual 7.54e-07 ***
## FTPTMAINPart-time < 2e-16 ***
## UTOTHRS 1.20e-08 ***
## MARSTATCommon-law 0.474309
## MARSTATWidowed 0.904488
## MARSTATSeparated 0.893291
## MARSTATDivorced 0.373613
## MARSTATSingle, NM 3.87e-09 ***
## UNIONNot member but covered 0.889507
## UNIONNon-unionized < 2e-16 ***
## LFSSTATEmployed, absent from work 0.052513 .
## MJHMultiple jobholder 0.185417
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05514 on 18781 degrees of freedom
## Multiple R-squared: 0.529, Adjusted R-squared: 0.5271
## F-statistic: 273.9 on 77 and 18781 DF, p-value: < 2.2e-16
# Diagnostic plots for new.model. The margins are set to
# c(5.1, 4.1, 4.1, 2.1) (bottom, left, top, right), par()'s documented default.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)
# Skewness of the residuals
# Fit on the untransformed response:
skewness(model[["residuals"]])
## [1] 1.105503
# Fit on the transformed response:
skewness(new.model[["residuals"]])
## [1] -0.1655788
# Prediction on the 2009 male test set.
# predict() with interval = "prediction" returns a matrix; only its "fit"
# column is used below.
prediction <- predict(new.model, interval = "prediction", newdata = test.09male)
# Errors
# Observed response raised to the power 0.21 so it can be compared with the
# fitted values (presumably new.model was fit to HRLYEARN^0.21 -- its
# definition is not in this part of the script; confirm).
# NOTE(review): RMSE, MAE and the 25% share below are therefore measured on
# the transformed scale, not on the original earnings scale -- confirm that
# this is the intended way to report accuracy.
actual <- test.09male$HRLYEARN^0.21
n.test <- nrow(test.09male)
errors <- prediction[, "fit"] - actual
hist(errors)
rmse <- sqrt(sum(errors^2) / n.test)
mae <- (1 / n.test) * sum(abs(errors))
# Absolute error as a percentage of the observed (transformed) value.
diff.percent <- 100 * (abs(errors) / actual)
# Share of test rows whose percentage error is at most 25. Counting with
# sum(..., na.rm = TRUE) keeps rows with a missing error out of the count;
# subsetting with a logical vector that contains NA would have counted them
# as hits.
diff.25 <- sum(diff.percent <= 25, na.rm = TRUE) / n.test
paste("RMSE:", rmse)
## [1] "RMSE: 0.0555053839803959"
paste("MAE:", mae)
## [1] "MAE: 0.0422321988304378"
paste("Percentage of cases with less than 25% error:", diff.25 * 100)
## [1] "Percentage of cases with less than 25% error: 99.0473833972535"
# Males 2019
# Model
# Upper bound of the stepwise search: every candidate predictor.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.19male
)
# Lower bound and starting point of the search: intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.19male)
# Stepwise selection by AIC in both directions, starting from `null`.
# Set trace = TRUE to print each add/drop step. FALSE is spelled out because
# the alias F is an ordinary variable that can be reassigned.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
##
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
## TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS +
## PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH +
## COWMAIN, data = train.19male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40507 -0.05420 -0.00902 0.04277 0.61524
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 3.407e-01 1.131e-02 30.131
## NOC_10Business, finance & administration -1.203e-01 3.615e-03 -33.271
## NOC_10Natural & applied sciences -8.063e-02 3.482e-03 -23.158
## NOC_10Health -7.901e-02 6.164e-03 -12.816
## NOC_10Educ., law, community & gov. services -7.289e-02 4.118e-03 -17.702
## NOC_10Art, culture, recreation & sport -1.332e-01 6.782e-03 -19.639
## NOC_10Sales & service -1.559e-01 3.381e-03 -46.110
## NOC_10Trades, transport & equipm. operators -1.277e-01 3.247e-03 -39.315
## NOC_10Natural resources & agriculture -1.441e-01 5.212e-03 -27.650
## NOC_10Manufacturing & utilities -1.451e-01 4.223e-03 -34.358
## NAICS_18Forestry, Fishing, Min., Oil & Gas 9.717e-02 6.317e-03 15.383
## NAICS_18Utilities 1.156e-01 8.987e-03 12.863
## NAICS_18Construction 5.202e-02 6.828e-03 7.619
## NAICS_18Manufacturing durables 1.660e-02 7.026e-03 2.363
## NAICS_18Manufacturing non-durables 1.504e-02 7.227e-03 2.081
## NAICS_18Wholesale Trade 3.235e-02 7.299e-03 4.432
## NAICS_18Retail Trade -1.047e-02 6.901e-03 -1.516
## NAICS_18Transportation & Warehousing 1.163e-02 7.012e-03 1.658
## NAICS_18Finance, Insurance, Real Est. & Leas. 3.876e-02 7.369e-03 5.261
## NAICS_18Prof., Scientific & Technical Services 4.925e-02 7.220e-03 6.821
## NAICS_18Management, Admin. & Support -1.136e-04 7.144e-03 -0.016
## NAICS_18Educational Services 2.013e-02 7.980e-03 2.523
## NAICS_18Health Care & Social Assistance -1.633e-02 7.860e-03 -2.078
## NAICS_18Information, Culture & Recreation 7.651e-03 7.320e-03 1.045
## NAICS_18Accommodation & Food Services -1.523e-02 7.354e-03 -2.071
## NAICS_18Other Services 1.465e-02 7.517e-03 1.949
## NAICS_18Public Administration 4.074e-02 7.652e-03 5.325
## AGE_12.L -1.319e-03 5.273e-03 -0.250
## AGE_12.Q -3.673e-02 4.741e-03 -7.748
## AGE_12.C -1.642e-02 3.870e-03 -4.242
## AGE_12^4 6.800e-03 3.452e-03 1.970
## AGE_12^5 -1.105e-02 3.169e-03 -3.485
## AGE_12^6 -3.398e-04 2.906e-03 -0.117
## AGE_12^7 -6.130e-03 2.628e-03 -2.333
## AGE_12^8 -4.770e-03 2.373e-03 -2.010
## AGE_12^9 -9.932e-05 2.193e-03 -0.045
## AGE_12^10 7.281e-04 2.107e-03 0.346
## AGE_12^11 2.274e-03 2.091e-03 1.087
## PROVPEI -2.745e-02 5.707e-03 -4.811
## PROVNS -2.001e-02 4.921e-03 -4.065
## PROVNB -2.706e-02 4.925e-03 -5.494
## PROVQC -1.403e-03 4.290e-03 -0.327
## PROVON 1.369e-02 4.109e-03 3.331
## PROVMB -7.232e-04 4.542e-03 -0.159
## PROVSK 1.979e-02 4.611e-03 4.292
## PROVAB 5.475e-02 4.358e-03 12.564
## PROVBC 2.868e-02 4.494e-03 6.382
## EDUC.L 6.938e-02 3.600e-03 19.273
## EDUC.Q 2.242e-02 3.325e-03 6.743
## EDUC.C 6.487e-03 2.621e-03 2.476
## EDUC^4 -3.277e-03 2.519e-03 -1.301
## EDUC^5 2.334e-04 1.750e-03 0.133
## EDUC^6 1.265e-03 2.251e-03 0.562
## TENURE 4.989e-02 2.429e-03 20.541
## FIRMSIZE.L 2.089e-02 1.935e-03 10.795
## FIRMSIZE.Q -1.287e-03 1.793e-03 -0.718
## FIRMSIZE.C 1.915e-03 1.815e-03 1.055
## MARSTATCommon-law -4.560e-03 2.120e-03 -2.151
## MARSTATWidowed -1.986e-02 1.006e-02 -1.975
## MARSTATSeparated 2.847e-03 5.607e-03 0.508
## MARSTATDivorced 7.375e-03 4.861e-03 1.517
## MARSTATSingle, NM -1.254e-02 3.485e-03 -3.600
## ESTSIZE.L 2.187e-02 2.073e-03 10.549
## ESTSIZE.Q 8.261e-03 1.716e-03 4.813
## ESTSIZE.C -1.445e-03 1.525e-03 -0.948
## FTPTMAINPart-time -2.867e-02 3.536e-03 -8.108
## UTOTHRS -4.967e-02 9.622e-03 -5.163
## PERMTEMPTemp. season -1.753e-02 3.422e-03 -5.123
## PERMTEMPTemp. contract -5.953e-03 3.076e-03 -1.936
## PERMTEMPTemp. casual -1.216e-02 4.287e-03 -2.837
## EFAMTYPEHWDENC 4.484e-03 3.668e-03 1.223
## EFAMTYPEHWDE17 -1.599e-03 4.888e-03 -0.327
## EFAMTYPEHWDE24 -7.134e-03 4.590e-03 -1.554
## EFAMTYPEHWSHNC 1.056e-02 4.210e-03 2.509
## EFAMTYPEHWSH17 1.851e-03 5.532e-03 0.335
## EFAMTYPEHWSH24 5.838e-03 7.291e-03 0.801
## EFAMTYPEHWSWNC -1.848e-03 1.011e-02 -0.183
## EFAMTYPEHWSW17 -2.988e-03 1.605e-02 -0.186
## EFAMTYPEHWSW24 3.075e-03 1.260e-02 0.244
## EFAMTYPEHWNENC -1.652e-02 7.659e-03 -2.157
## EFAMTYPEHWNE17 -1.569e-02 2.228e-02 -0.704
## EFAMTYPEHWNE24 6.517e-03 1.872e-02 0.348
## EFAMTYPESPE17 5.415e-03 5.872e-03 0.922
## EFAMTYPESPE24 -5.940e-03 6.393e-03 -0.929
## EFAMTYPESPN17 2.430e-03 1.352e-02 0.180
## EFAMTYPESPN24 -9.253e-03 1.563e-02 -0.592
## EFAMTYPEOther -1.002e-02 3.194e-03 -3.139
## SCHOOLNFull-time student -4.928e-03 4.396e-03 -1.121
## SCHOOLNPart-time student -2.251e-02 5.771e-03 -3.901
## SCHOOLNUnknown NA NA NA
## CMAToronto -8.723e-03 5.010e-03 -1.741
## CMAVancouver -1.860e-02 5.705e-03 -3.261
## CMAOther -4.095e-03 3.881e-03 -1.055
## UNIONNot member but covered 6.762e-03 5.095e-03 1.327
## UNIONNon-unionized -5.923e-03 1.897e-03 -3.121
## AGYOWNK.L -8.096e-03 4.649e-03 -1.741
## AGYOWNK.Q -9.426e-03 3.163e-03 -2.980
## AGYOWNK.C -3.479e-03 3.547e-03 -0.981
## AGYOWNK^4 -4.587e-04 3.344e-03 -0.137
## MJHMultiple jobholder -5.659e-03 3.572e-03 -1.584
## COWMAINPrivate sector 4.808e-03 3.337e-03 1.441
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health < 2e-16 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction 2.69e-14 ***
## NAICS_18Manufacturing durables 0.018155 *
## NAICS_18Manufacturing non-durables 0.037463 *
## NAICS_18Wholesale Trade 9.40e-06 ***
## NAICS_18Retail Trade 0.129426
## NAICS_18Transportation & Warehousing 0.097236 .
## NAICS_18Finance, Insurance, Real Est. & Leas. 1.45e-07 ***
## NAICS_18Prof., Scientific & Technical Services 9.31e-12 ***
## NAICS_18Management, Admin. & Support 0.987312
## NAICS_18Educational Services 0.011651 *
## NAICS_18Health Care & Social Assistance 0.037720 *
## NAICS_18Information, Culture & Recreation 0.295910
## NAICS_18Accommodation & Food Services 0.038363 *
## NAICS_18Other Services 0.051281 .
## NAICS_18Public Administration 1.02e-07 ***
## AGE_12.L 0.802421
## AGE_12.Q 9.87e-15 ***
## AGE_12.C 2.23e-05 ***
## AGE_12^4 0.048863 *
## AGE_12^5 0.000493 ***
## AGE_12^6 0.906898
## AGE_12^7 0.019683 *
## AGE_12^8 0.044455 *
## AGE_12^9 0.963875
## AGE_12^10 0.729696
## AGE_12^11 0.276856
## PROVPEI 1.52e-06 ***
## PROVNS 4.83e-05 ***
## PROVNB 3.99e-08 ***
## PROVQC 0.743617
## PROVON 0.000866 ***
## PROVMB 0.873489
## PROVSK 1.78e-05 ***
## PROVAB < 2e-16 ***
## PROVBC 1.79e-10 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 1.60e-11 ***
## EDUC.C 0.013310 *
## EDUC^4 0.193324
## EDUC^5 0.893877
## EDUC^6 0.574242
## TENURE < 2e-16 ***
## FIRMSIZE.L < 2e-16 ***
## FIRMSIZE.Q 0.473011
## FIRMSIZE.C 0.291378
## MARSTATCommon-law 0.031491 *
## MARSTATWidowed 0.048297 *
## MARSTATSeparated 0.611674
## MARSTATDivorced 0.129261
## MARSTATSingle, NM 0.000319 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 1.50e-06 ***
## ESTSIZE.C 0.343338
## FTPTMAINPart-time 5.49e-16 ***
## UTOTHRS 2.46e-07 ***
## PERMTEMPTemp. season 3.04e-07 ***
## PERMTEMPTemp. contract 0.052932 .
## PERMTEMPTemp. casual 0.004557 **
## EFAMTYPEHWDENC 0.221449
## EFAMTYPEHWDE17 0.743525
## EFAMTYPEHWDE24 0.120146
## EFAMTYPEHWSHNC 0.012119 *
## EFAMTYPEHWSH17 0.737991
## EFAMTYPEHWSH24 0.423270
## EFAMTYPEHWSWNC 0.854954
## EFAMTYPEHWSW17 0.852334
## EFAMTYPEHWSW24 0.807214
## EFAMTYPEHWNENC 0.030993 *
## EFAMTYPEHWNE17 0.481223
## EFAMTYPEHWNE24 0.727802
## EFAMTYPESPE17 0.356429
## EFAMTYPESPE24 0.352819
## EFAMTYPESPN17 0.857419
## EFAMTYPESPN24 0.553825
## EFAMTYPEOther 0.001698 **
## SCHOOLNFull-time student 0.262291
## SCHOOLNPart-time student 9.61e-05 ***
## SCHOOLNUnknown NA
## CMAToronto 0.081694 .
## CMAVancouver 0.001113 **
## CMAOther 0.291270
## UNIONNot member but covered 0.184417
## UNIONNon-unionized 0.001803 **
## AGYOWNK.L 0.081644 .
## AGYOWNK.Q 0.002886 **
## AGYOWNK.C 0.326718
## AGYOWNK^4 0.890889
## MJHMultiple jobholder 0.113172
## COWMAINPrivate sector 0.149588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09074 on 17749 degrees of freedom
## Multiple R-squared: 0.4989, Adjusted R-squared: 0.4961
## F-statistic: 178.5 on 99 and 17749 DF, p-value: < 2.2e-16
# Diagnostic plots for the stepwise-selected model
# (left disabled: par(mfrow = c(2, 2)) to lay the plots out on a 2 x 2 grid)
# Margins c(5.1, 4.1, 4.1, 2.1) are par()'s documented default
# (bottom, left, top, right).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = model, lwd = 6)
# Box Cox Transformation (Selected attributes from Stepwise Regression)
# Refit of the stepwise-selected formula; this is the model handed to boxcox().
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS +
    PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH +
    COWMAIN,
  data = train.19male
)
# Enlarged bottom margin for the Box-Cox profile plot.
par(mar = c(15, 5, 2, 1))
# Profile log-likelihood over lambda in [-3, 3]; bc$x holds the lambda values
# and bc$y the corresponding log-likelihoods.
bc <- boxcox(bc.model, lambda = seq(-3, 3))
# Lambda with the highest log-likelihood. which.max() returns exactly one
# index and avoids comparing floating-point values with `==`.
bc$x[which.max(bc$y)]
## [1] -0.03030303
# Refit with the response raised to the selected lambda, rounded to -0.03.
# The predictors must stay identical to bc.model.
new.model <- lm(
  formula = (HRLYEARN)^-0.03 ~ NOC_10 + NAICS_18 + AGE_12 + PROV + EDUC +
    TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN + UTOTHRS +
    PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK + MJH +
    COWMAIN,
  data = train.19male
)
summary(new.model)
##
## Call:
## lm(formula = (HRLYEARN)^-0.03 ~ NOC_10 + NAICS_18 + AGE_12 +
## PROV + EDUC + TENURE + FIRMSIZE + MARSTAT + ESTSIZE + FTPTMAIN +
## UTOTHRS + PERMTEMP + EFAMTYPE + SCHOOLN + CMA + UNION + AGYOWNK +
## MJH + COWMAIN, data = train.19male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.052657 -0.006728 -0.000007 0.006521 0.108772
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 1.039e+00 1.344e-03 772.595
## NOC_10Business, finance & administration 1.167e-02 4.298e-04 27.144
## NOC_10Natural & applied sciences 6.925e-03 4.140e-04 16.728
## NOC_10Health 6.627e-03 7.330e-04 9.041
## NOC_10Educ., law, community & gov. services 6.691e-03 4.896e-04 13.667
## NOC_10Art, culture, recreation & sport 1.262e-02 8.064e-04 15.647
## NOC_10Sales & service 1.684e-02 4.020e-04 41.902
## NOC_10Trades, transport & equipm. operators 1.218e-02 3.861e-04 31.544
## NOC_10Natural resources & agriculture 1.423e-02 6.197e-04 22.957
## NOC_10Manufacturing & utilities 1.498e-02 5.021e-04 29.834
## NAICS_18Forestry, Fishing, Min., Oil & Gas -1.241e-02 7.511e-04 -16.526
## NAICS_18Utilities -1.314e-02 1.069e-03 -12.298
## NAICS_18Construction -8.587e-03 8.119e-04 -10.576
## NAICS_18Manufacturing durables -4.158e-03 8.355e-04 -4.977
## NAICS_18Manufacturing non-durables -3.289e-03 8.594e-04 -3.827
## NAICS_18Wholesale Trade -6.059e-03 8.678e-04 -6.982
## NAICS_18Retail Trade 1.044e-03 8.206e-04 1.272
## NAICS_18Transportation & Warehousing -2.591e-03 8.338e-04 -3.108
## NAICS_18Finance, Insurance, Real Est. & Leas. -6.011e-03 8.762e-04 -6.861
## NAICS_18Prof., Scientific & Technical Services -7.569e-03 8.585e-04 -8.816
## NAICS_18Management, Admin. & Support -6.331e-04 8.494e-04 -0.745
## NAICS_18Educational Services -3.404e-03 9.488e-04 -3.588
## NAICS_18Health Care & Social Assistance 6.959e-04 9.345e-04 0.745
## NAICS_18Information, Culture & Recreation -2.467e-03 8.704e-04 -2.834
## NAICS_18Accommodation & Food Services 2.421e-03 8.744e-04 2.769
## NAICS_18Other Services -3.382e-03 8.938e-04 -3.784
## NAICS_18Public Administration -5.901e-03 9.099e-04 -6.485
## AGE_12.L 3.408e-04 6.270e-04 0.544
## AGE_12.Q 7.586e-03 5.637e-04 13.458
## AGE_12.C 1.305e-03 4.602e-04 2.837
## AGE_12^4 5.060e-04 4.104e-04 1.233
## AGE_12^5 1.659e-03 3.768e-04 4.401
## AGE_12^6 -1.540e-04 3.455e-04 -0.446
## AGE_12^7 9.278e-04 3.125e-04 2.969
## AGE_12^8 3.330e-04 2.822e-04 1.180
## AGE_12^9 5.225e-05 2.607e-04 0.200
## AGE_12^10 -6.960e-05 2.506e-04 -0.278
## AGE_12^11 -1.912e-04 2.486e-04 -0.769
## PROVPEI 3.531e-03 6.786e-04 5.204
## PROVNS 2.369e-03 5.852e-04 4.049
## PROVNB 3.296e-03 5.856e-04 5.629
## PROVQC -1.536e-04 5.102e-04 -0.301
## PROVON -2.433e-03 4.886e-04 -4.979
## PROVMB -9.361e-05 5.401e-04 -0.173
## PROVSK -2.849e-03 5.482e-04 -5.197
## PROVAB -7.269e-03 5.182e-04 -14.027
## PROVBC -4.697e-03 5.343e-04 -8.791
## EDUC.L -7.634e-03 4.281e-04 -17.835
## EDUC.Q -1.312e-03 3.954e-04 -3.319
## EDUC.C -3.132e-04 3.116e-04 -1.005
## EDUC^4 4.845e-04 2.995e-04 1.618
## EDUC^5 -1.211e-04 2.081e-04 -0.582
## EDUC^6 -2.353e-04 2.676e-04 -0.879
## TENURE -6.340e-03 2.888e-04 -21.953
## FIRMSIZE.L -2.470e-03 2.301e-04 -10.733
## FIRMSIZE.Q 3.817e-04 2.132e-04 1.790
## FIRMSIZE.C -3.347e-04 2.158e-04 -1.551
## MARSTATCommon-law 2.673e-04 2.521e-04 1.060
## MARSTATWidowed 2.198e-03 1.196e-03 1.838
## MARSTATSeparated -4.209e-04 6.667e-04 -0.631
## MARSTATDivorced -9.903e-04 5.780e-04 -1.713
## MARSTATSingle, NM 1.708e-03 4.143e-04 4.123
## ESTSIZE.L -2.402e-03 2.465e-04 -9.745
## ESTSIZE.Q -8.068e-04 2.041e-04 -3.953
## ESTSIZE.C 2.603e-04 1.814e-04 1.435
## FTPTMAINPart-time 5.357e-03 4.205e-04 12.739
## UTOTHRS 5.095e-03 1.144e-03 4.453
## PERMTEMPTemp. season 2.886e-03 4.069e-04 7.093
## PERMTEMPTemp. contract 7.424e-04 3.657e-04 2.030
## PERMTEMPTemp. casual 2.263e-03 5.098e-04 4.439
## EFAMTYPEHWDENC -7.464e-04 4.361e-04 -1.711
## EFAMTYPEHWDE17 7.805e-04 5.812e-04 1.343
## EFAMTYPEHWDE24 1.253e-03 5.458e-04 2.296
## EFAMTYPEHWSHNC -1.097e-03 5.005e-04 -2.191
## EFAMTYPEHWSH17 9.804e-04 6.578e-04 1.491
## EFAMTYPEHWSH24 4.950e-04 8.669e-04 0.571
## EFAMTYPEHWSWNC 1.694e-03 1.202e-03 1.409
## EFAMTYPEHWSW17 1.462e-03 1.909e-03 0.766
## EFAMTYPEHWSW24 -5.552e-04 1.498e-03 -0.371
## EFAMTYPEHWNENC 2.464e-03 9.107e-04 2.706
## EFAMTYPEHWNE17 7.108e-03 2.649e-03 2.683
## EFAMTYPEHWNE24 -1.888e-04 2.226e-03 -0.085
## EFAMTYPESPE17 -1.586e-04 6.982e-04 -0.227
## EFAMTYPESPE24 1.493e-03 7.601e-04 1.964
## EFAMTYPESPN17 -2.651e-04 1.608e-03 -0.165
## EFAMTYPESPN24 2.183e-03 1.858e-03 1.175
## EFAMTYPEOther 1.731e-03 3.797e-04 4.558
## SCHOOLNFull-time student 8.401e-04 5.227e-04 1.607
## SCHOOLNPart-time student 3.037e-03 6.862e-04 4.426
## SCHOOLNUnknown NA NA NA
## CMAToronto 8.376e-04 5.958e-04 1.406
## CMAVancouver 2.164e-03 6.783e-04 3.191
## CMAOther 5.702e-04 4.614e-04 1.236
## UNIONNot member but covered -7.646e-04 6.058e-04 -1.262
## UNIONNon-unionized 1.915e-03 2.256e-04 8.486
## AGYOWNK.L 1.261e-03 5.528e-04 2.281
## AGYOWNK.Q 1.272e-03 3.761e-04 3.382
## AGYOWNK.C 4.742e-04 4.217e-04 1.124
## AGYOWNK^4 1.671e-04 3.976e-04 0.420
## MJHMultiple jobholder 7.441e-04 4.248e-04 1.752
## COWMAINPrivate sector 3.337e-04 3.968e-04 0.841
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health < 2e-16 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction < 2e-16 ***
## NAICS_18Manufacturing durables 6.51e-07 ***
## NAICS_18Manufacturing non-durables 0.000130 ***
## NAICS_18Wholesale Trade 3.02e-12 ***
## NAICS_18Retail Trade 0.203213
## NAICS_18Transportation & Warehousing 0.001888 **
## NAICS_18Finance, Insurance, Real Est. & Leas. 7.07e-12 ***
## NAICS_18Prof., Scientific & Technical Services < 2e-16 ***
## NAICS_18Management, Admin. & Support 0.456062
## NAICS_18Educational Services 0.000335 ***
## NAICS_18Health Care & Social Assistance 0.456521
## NAICS_18Information, Culture & Recreation 0.004600 **
## NAICS_18Accommodation & Food Services 0.005627 **
## NAICS_18Other Services 0.000155 ***
## NAICS_18Public Administration 9.08e-11 ***
## AGE_12.L 0.586753
## AGE_12.Q < 2e-16 ***
## AGE_12.C 0.004563 **
## AGE_12^4 0.217689
## AGE_12^5 1.08e-05 ***
## AGE_12^6 0.655704
## AGE_12^7 0.002991 **
## AGE_12^8 0.237998
## AGE_12^9 0.841170
## AGE_12^10 0.781179
## AGE_12^11 0.441900
## PROVPEI 1.98e-07 ***
## PROVNS 5.17e-05 ***
## PROVNB 1.84e-08 ***
## PROVQC 0.763376
## PROVON 6.45e-07 ***
## PROVMB 0.862393
## PROVSK 2.05e-07 ***
## PROVAB < 2e-16 ***
## PROVBC < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 0.000905 ***
## EDUC.C 0.314844
## EDUC^4 0.105728
## EDUC^5 0.560434
## EDUC^6 0.379268
## TENURE < 2e-16 ***
## FIRMSIZE.L < 2e-16 ***
## FIRMSIZE.Q 0.073443 .
## FIRMSIZE.C 0.120931
## MARSTATCommon-law 0.288969
## MARSTATWidowed 0.066055 .
## MARSTATSeparated 0.527849
## MARSTATDivorced 0.086697 .
## MARSTATSingle, NM 3.76e-05 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 7.75e-05 ***
## ESTSIZE.C 0.151296
## FTPTMAINPart-time < 2e-16 ***
## UTOTHRS 8.51e-06 ***
## PERMTEMPTemp. season 1.36e-12 ***
## PERMTEMPTemp. contract 0.042358 *
## PERMTEMPTemp. casual 9.10e-06 ***
## EFAMTYPEHWDENC 0.087013 .
## EFAMTYPEHWDE17 0.179368
## EFAMTYPEHWDE24 0.021695 *
## EFAMTYPEHWSHNC 0.028471 *
## EFAMTYPEHWSH17 0.136104
## EFAMTYPEHWSH24 0.568034
## EFAMTYPEHWSWNC 0.158891
## EFAMTYPEHWSW17 0.443709
## EFAMTYPEHWSW24 0.710996
## EFAMTYPEHWNENC 0.006812 **
## EFAMTYPEHWNE17 0.007294 **
## EFAMTYPEHWNE24 0.932427
## EFAMTYPESPE17 0.820329
## EFAMTYPESPE24 0.049558 *
## EFAMTYPESPN17 0.869080
## EFAMTYPESPN24 0.240192
## EFAMTYPEOther 5.19e-06 ***
## SCHOOLNFull-time student 0.108026
## SCHOOLNPart-time student 9.66e-06 ***
## SCHOOLNUnknown NA
## CMAToronto 0.159738
## CMAVancouver 0.001423 **
## CMAOther 0.216587
## UNIONNot member but covered 0.206887
## UNIONNon-unionized < 2e-16 ***
## AGYOWNK.L 0.022569 *
## AGYOWNK.Q 0.000721 ***
## AGYOWNK.C 0.260905
## AGYOWNK^4 0.674359
## MJHMultiple jobholder 0.079817 .
## COWMAINPrivate sector 0.400269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01079 on 17749 degrees of freedom
## Multiple R-squared: 0.552, Adjusted R-squared: 0.5495
## F-statistic: 220.9 on 99 and 17749 DF, p-value: < 2.2e-16
# Diagnostic plots for the Box-Cox-transformed fit. The margins go back to
# c(5.1, 4.1, 4.1, 2.1) (bottom, left, top, right), par()'s documented default.
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)
# Skewness of the residuals
# Fit on the untransformed response:
skewness(model[["residuals"]])
## [1] 1.032538
# Fit on the transformed response:
skewness(new.model[["residuals"]])
## [1] 0.4713438
# Prediction on the 2019 male test set.
# predict() with interval = "prediction" returns a matrix; only its "fit"
# column is used below.
prediction <- predict(new.model, interval = "prediction", newdata = test.19male)
## Warning in predict.lm(new.model, interval = "prediction", newdata = test.
## 19male): prediction from a rank-deficient fit may be misleading
# Errors
# Observed response raised to the same power (-0.03) as the response of
# new.model, so fitted and observed values are on the same scale.
# NOTE(review): RMSE, MAE and the 25% share below are therefore measured on
# the transformed scale, not on the original earnings scale -- confirm that
# this is the intended way to report accuracy.
actual <- test.19male$HRLYEARN^-0.03
n.test <- nrow(test.19male)
errors <- prediction[, "fit"] - actual
hist(errors)
rmse <- sqrt(sum(errors^2) / n.test)
mae <- (1 / n.test) * sum(abs(errors))
# Absolute error as a percentage of the observed (transformed) value.
diff.percent <- 100 * (abs(errors) / actual)
# Share of test rows whose percentage error is at most 25. Counting with
# sum(..., na.rm = TRUE) keeps rows with a missing error out of the count;
# subsetting with a logical vector that contains NA would have counted them
# as hits.
diff.25 <- sum(diff.percent <= 25, na.rm = TRUE) / n.test
paste("RMSE:", rmse)
## [1] "RMSE: 0.0110021049366726"
paste("MAE:", mae)
## [1] "MAE: 0.00830950408353929"
paste("Percentage of cases with less than 25% error:", diff.25 * 100)
## [1] "Percentage of cases with less than 25% error: 100"
# Females 2009
# Model
# Upper bound of the stepwise search: every candidate predictor.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.09fem
)
# Lower bound and starting point of the search: intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.09fem)
# Stepwise selection by AIC in both directions, starting from `null`.
# Set trace = TRUE to print each add/drop step. FALSE is spelled out because
# the alias F is an ordinary variable that can be reassigned.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
##
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + NAICS_18 + EDUC + TENURE + ESTSIZE +
## PROV + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
## UTOTHRS + UNION + AGYOWNK + SCHOOLN + CMA, data = train.09fem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26133 -0.03550 -0.00523 0.02602 0.50701
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 2.075e-01 9.059e-03 22.904
## NOC_10Business, finance & administration -7.498e-02 2.193e-03 -34.197
## NOC_10Natural & applied sciences -3.951e-02 3.490e-03 -11.322
## NOC_10Health -2.123e-02 2.637e-03 -8.052
## NOC_10Educ., law, community & gov. services -3.279e-02 2.505e-03 -13.090
## NOC_10Art, culture, recreation & sport -5.623e-02 3.860e-03 -14.569
## NOC_10Sales & service -8.616e-02 2.266e-03 -38.024
## NOC_10Trades, transport & equipm. operators -8.815e-02 4.012e-03 -21.974
## NOC_10Natural resources & agriculture -7.752e-02 6.905e-03 -11.227
## NOC_10Manufacturing & utilities -1.015e-01 4.013e-03 -25.282
## NAICS_18Forestry, Fishing, Min., Oil & Gas 7.122e-02 8.168e-03 8.720
## NAICS_18Utilities 4.498e-02 9.716e-03 4.629
## NAICS_18Construction 3.144e-02 8.216e-03 3.826
## NAICS_18Manufacturing durables 3.825e-02 7.882e-03 4.853
## NAICS_18Manufacturing non-durables 1.797e-02 7.747e-03 2.320
## NAICS_18Wholesale Trade 2.638e-02 7.931e-03 3.326
## NAICS_18Retail Trade -5.281e-03 7.328e-03 -0.721
## NAICS_18Transportation & Warehousing 2.109e-02 7.789e-03 2.707
## NAICS_18Finance, Insurance, Real Est. & Leas. 3.000e-02 7.415e-03 4.046
## NAICS_18Prof., Scientific & Technical Services 3.410e-02 7.553e-03 4.515
## NAICS_18Management, Admin. & Support 6.401e-03 7.585e-03 0.844
## NAICS_18Educational Services 1.823e-02 7.543e-03 2.417
## NAICS_18Health Care & Social Assistance 2.895e-03 7.365e-03 0.393
## NAICS_18Information, Culture & Recreation 1.362e-02 7.517e-03 1.811
## NAICS_18Accommodation & Food Services -1.083e-03 7.400e-03 -0.146
## NAICS_18Other Services 1.848e-02 7.527e-03 2.456
## NAICS_18Public Administration 3.788e-02 7.548e-03 5.019
## EDUC.L 6.333e-02 2.601e-03 24.348
## EDUC.Q 2.693e-02 2.338e-03 11.517
## EDUC.C 7.972e-03 1.864e-03 4.278
## EDUC^4 -5.010e-03 1.624e-03 -3.085
## EDUC^5 -5.629e-03 1.172e-03 -4.805
## EDUC^6 -4.598e-03 1.314e-03 -3.499
## TENURE 3.909e-02 1.661e-03 23.541
## ESTSIZE.L 2.316e-02 1.351e-03 17.145
## ESTSIZE.Q 2.206e-03 1.139e-03 1.937
## ESTSIZE.C 8.789e-05 1.033e-03 0.085
## PROVPEI -3.210e-03 3.677e-03 -0.873
## PROVNS 3.114e-03 3.183e-03 0.978
## PROVNB -1.525e-03 3.160e-03 -0.483
## PROVQC 1.095e-02 2.776e-03 3.946
## PROVON 2.325e-02 2.651e-03 8.771
## PROVMB 8.519e-03 2.888e-03 2.950
## PROVSK 1.961e-02 2.988e-03 6.563
## PROVAB 4.030e-02 2.864e-03 14.073
## PROVBC 2.689e-02 3.019e-03 8.906
## AGE_12.L 8.367e-03 4.642e-03 1.802
## AGE_12.Q -2.385e-02 4.715e-03 -5.058
## AGE_12.C -1.874e-03 4.178e-03 -0.449
## AGE_12^4 1.114e-02 3.668e-03 3.037
## AGE_12^5 -5.517e-03 3.198e-03 -1.725
## AGE_12^6 5.822e-03 2.740e-03 2.125
## AGE_12^7 -3.147e-03 2.285e-03 -1.377
## AGE_12^8 1.859e-03 1.894e-03 0.982
## AGE_12^9 2.688e-04 1.610e-03 0.167
## AGE_12^10 5.326e-04 1.426e-03 0.373
## AGE_12^11 -2.374e-03 1.308e-03 -1.815
## COWMAINPrivate sector -1.836e-02 1.771e-03 -10.369
## PERMTEMPTemp. season -1.205e-02 2.889e-03 -4.169
## PERMTEMPTemp. contract -1.133e-02 1.908e-03 -5.939
## PERMTEMPTemp. casual -1.017e-02 2.313e-03 -4.397
## FIRMSIZE.L 6.336e-03 1.202e-03 5.270
## FIRMSIZE.Q 1.592e-03 1.193e-03 1.334
## FIRMSIZE.C 1.346e-03 1.275e-03 1.056
## FTPTMAINPart-time -1.116e-02 1.748e-03 -6.384
## UTOTHRS -3.022e-02 7.155e-03 -4.224
## UNIONNot member but covered 8.201e-03 3.415e-03 2.401
## UNIONNon-unionized -3.812e-03 1.352e-03 -2.819
## AGYOWNK.L -3.545e-03 1.344e-03 -2.638
## AGYOWNK.Q 3.083e-03 1.419e-03 2.173
## AGYOWNK.C 1.063e-04 1.474e-03 0.072
## AGYOWNK^4 1.507e-03 1.471e-03 1.024
## SCHOOLNFull-time student -5.050e-03 2.407e-03 -2.098
## SCHOOLNPart-time student 2.114e-03 2.730e-03 0.774
## SCHOOLNUnknown NA NA NA
## CMAToronto 3.005e-03 3.287e-03 0.914
## CMAVancouver -1.787e-03 3.734e-03 -0.478
## CMAOther -2.091e-03 2.500e-03 -0.836
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health 8.63e-16 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities 3.70e-06 ***
## NAICS_18Construction 0.000130 ***
## NAICS_18Manufacturing durables 1.23e-06 ***
## NAICS_18Manufacturing non-durables 0.020365 *
## NAICS_18Wholesale Trade 0.000882 ***
## NAICS_18Retail Trade 0.471159
## NAICS_18Transportation & Warehousing 0.006793 **
## NAICS_18Finance, Insurance, Real Est. & Leas. 5.24e-05 ***
## NAICS_18Prof., Scientific & Technical Services 6.37e-06 ***
## NAICS_18Management, Admin. & Support 0.398750
## NAICS_18Educational Services 0.015640 *
## NAICS_18Health Care & Social Assistance 0.694278
## NAICS_18Information, Culture & Recreation 0.070103 .
## NAICS_18Accommodation & Food Services 0.883633
## NAICS_18Other Services 0.014072 *
## NAICS_18Public Administration 5.24e-07 ***
## EDUC.L < 2e-16 ***
## EDUC.Q < 2e-16 ***
## EDUC.C 1.90e-05 ***
## EDUC^4 0.002042 **
## EDUC^5 1.56e-06 ***
## EDUC^6 0.000468 ***
## TENURE < 2e-16 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.052762 .
## ESTSIZE.C 0.932167
## PROVPEI 0.382669
## PROVNS 0.327954
## PROVNB 0.629444
## PROVQC 7.99e-05 ***
## PROVON < 2e-16 ***
## PROVMB 0.003186 **
## PROVSK 5.41e-11 ***
## PROVAB < 2e-16 ***
## PROVBC < 2e-16 ***
## AGE_12.L 0.071489 .
## AGE_12.Q 4.27e-07 ***
## AGE_12.C 0.653695
## AGE_12^4 0.002393 **
## AGE_12^5 0.084478 .
## AGE_12^6 0.033605 *
## AGE_12^7 0.168412
## AGE_12^8 0.326353
## AGE_12^9 0.867382
## AGE_12^10 0.708860
## AGE_12^11 0.069595 .
## COWMAINPrivate sector < 2e-16 ***
## PERMTEMPTemp. season 3.07e-05 ***
## PERMTEMPTemp. contract 2.92e-09 ***
## PERMTEMPTemp. casual 1.10e-05 ***
## FIRMSIZE.L 1.38e-07 ***
## FIRMSIZE.Q 0.182105
## FIRMSIZE.C 0.291076
## FTPTMAINPart-time 1.77e-10 ***
## UTOTHRS 2.41e-05 ***
## UNIONNot member but covered 0.016348 *
## UNIONNon-unionized 0.004825 **
## AGYOWNK.L 0.008338 **
## AGYOWNK.Q 0.029798 *
## AGYOWNK.C 0.942483
## AGYOWNK^4 0.305664
## SCHOOLNFull-time student 0.035956 *
## SCHOOLNPart-time student 0.438821
## SCHOOLNUnknown NA
## CMAToronto 0.360587
## CMAVancouver 0.632330
## CMAOther 0.402976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06268 on 19253 degrees of freedom
## Multiple R-squared: 0.5226, Adjusted R-squared: 0.5207
## F-statistic: 277.3 on 76 and 19253 DF, p-value: < 2.2e-16
# Diagnostic plots for the stepwise-selected model
# (left disabled: par(mfrow = c(2, 2)) to lay the plots out on a 2 x 2 grid)
# Margins c(5.1, 4.1, 4.1, 2.1) are par()'s documented default
# (bottom, left, top, right).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = model, lwd = 6)
# Box Cox Transformation (Selected attributes from Stepwise Regression)
# Refit of the stepwise-selected formula; this is the model handed to boxcox().
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + NAICS_18 + EDUC + TENURE + ESTSIZE +
    PROV + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + UNION + AGYOWNK + SCHOOLN + CMA,
  data = train.09fem
)
# Enlarged bottom margin for the Box-Cox profile plot.
par(mar = c(15, 5, 2, 1))
# Profile log-likelihood over lambda in [-3, 3]; bc$x holds the lambda values
# and bc$y the corresponding log-likelihoods.
bc <- boxcox(bc.model, lambda = seq(-3, 3))
# Lambda with the highest log-likelihood. which.max() returns exactly one
# index and avoids comparing floating-point values with `==`.
bc$x[which.max(bc$y)]
## [1] -0.1515152
# Refit with the response raised to the selected lambda, rounded to -0.15.
# The predictors are the same stepwise-selected set used in bc.model. The
# previous version of this call carried a different predictor list (MARSTAT,
# LFSSTAT and MJH in place of COWMAIN, SCHOOLN and CMA), so lambda was
# estimated on one model and applied to another.
# NOTE(review): the rendered summary printed after this call was produced with
# the previous predictor list; re-run the script to refresh it.
new.model <- lm(
  formula = (HRLYEARN)^-0.15 ~ NOC_10 + NAICS_18 + EDUC + TENURE + ESTSIZE +
    PROV + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + UNION + AGYOWNK + SCHOOLN + CMA,
  data = train.09fem
)
summary(new.model)
##
## Call:
## lm(formula = (HRLYEARN)^-0.15 ~ NOC_10 + NAICS_18 + AGE_12 +
## PROV + EDUC + TENURE + ESTSIZE + AGYOWNK + FIRMSIZE + PERMTEMP +
## FTPTMAIN + UTOTHRS + MARSTAT + UNION + LFSSTAT + MJH, data = train.09fem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31644 -0.04301 -0.00080 0.04104 0.74188
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.3541073 0.0098807 137.046
## NOC_10Business, finance & administration 0.0653838 0.0024968 26.187
## NOC_10Natural & applied sciences 0.0335933 0.0039718 8.458
## NOC_10Health 0.0159336 0.0029972 5.316
## NOC_10Educ., law, community & gov. services 0.0313051 0.0028518 10.977
## NOC_10Art, culture, recreation & sport 0.0488085 0.0043996 11.094
## NOC_10Sales & service 0.0920522 0.0025807 35.670
## NOC_10Trades, transport & equipm. operators 0.0884843 0.0045658 19.380
## NOC_10Natural resources & agriculture 0.0730422 0.0078595 9.294
## NOC_10Manufacturing & utilities 0.1105264 0.0045683 24.194
## NAICS_18Forestry, Fishing, Min., Oil & Gas -0.0909705 0.0092997 -9.782
## NAICS_18Utilities -0.0805083 0.0110075 -7.314
## NAICS_18Construction -0.0624766 0.0093523 -6.680
## NAICS_18Manufacturing durables -0.0693666 0.0089749 -7.729
## NAICS_18Manufacturing non-durables -0.0402018 0.0088197 -4.558
## NAICS_18Wholesale Trade -0.0546718 0.0090263 -6.057
## NAICS_18Retail Trade -0.0021852 0.0083444 -0.262
## NAICS_18Transportation & Warehousing -0.0492761 0.0088572 -5.563
## NAICS_18Finance, Insurance, Real Est. & Leas. -0.0585363 0.0084397 -6.936
## NAICS_18Prof., Scientific & Technical Services -0.0630297 0.0085962 -7.332
## NAICS_18Management, Admin. & Support -0.0182464 0.0086363 -2.113
## NAICS_18Educational Services -0.0579554 0.0084723 -6.841
## NAICS_18Health Care & Social Assistance -0.0365374 0.0083553 -4.373
## NAICS_18Information, Culture & Recreation -0.0384057 0.0085543 -4.490
## NAICS_18Accommodation & Food Services -0.0055077 0.0084281 -0.653
## NAICS_18Other Services -0.0395860 0.0085700 -4.619
## NAICS_18Public Administration -0.0779594 0.0084541 -9.222
## AGE_12.L -0.0126098 0.0054948 -2.295
## AGE_12.Q 0.0483309 0.0054258 8.908
## AGE_12.C -0.0122620 0.0047651 -2.573
## AGE_12^4 0.0018906 0.0041823 0.452
## AGE_12^5 0.0068987 0.0036494 1.890
## AGE_12^6 -0.0073045 0.0031228 -2.339
## AGE_12^7 0.0064846 0.0026029 2.491
## AGE_12^8 -0.0035263 0.0021565 -1.635
## AGE_12^9 -0.0006300 0.0018328 -0.344
## AGE_12^10 -0.0007318 0.0016236 -0.451
## AGE_12^11 0.0017424 0.0014896 1.170
## PROVPEI 0.0010888 0.0041870 0.260
## PROVNS -0.0051395 0.0036245 -1.418
## PROVNB 0.0003744 0.0035984 0.104
## PROVQC -0.0209276 0.0031011 -6.749
## PROVON -0.0333490 0.0029829 -11.180
## PROVMB -0.0182755 0.0032895 -5.556
## PROVSK -0.0339569 0.0034036 -9.977
## PROVAB -0.0603071 0.0032609 -18.494
## PROVBC -0.0410203 0.0032133 -12.766
## EDUC.L -0.0686123 0.0029564 -23.208
## EDUC.Q -0.0140310 0.0026620 -5.271
## EDUC.C -0.0056037 0.0021219 -2.641
## EDUC^4 0.0046728 0.0018482 2.528
## EDUC^5 0.0071201 0.0013327 5.343
## EDUC^6 0.0029028 0.0014854 1.954
## TENURE -0.0495809 0.0018875 -26.269
## ESTSIZE.L -0.0251087 0.0015274 -16.439
## ESTSIZE.Q -0.0006827 0.0012910 -0.529
## ESTSIZE.C 0.0002798 0.0011746 0.238
## AGYOWNK.L 0.0017249 0.0015918 1.084
## AGYOWNK.Q -0.0024575 0.0016280 -1.510
## AGYOWNK.C 0.0002202 0.0016845 0.131
## AGYOWNK^4 -0.0028836 0.0016757 -1.721
## FIRMSIZE.L -0.0108208 0.0013576 -7.971
## FIRMSIZE.Q 0.0008824 0.0013584 0.650
## FIRMSIZE.C -0.0021363 0.0014512 -1.472
## PERMTEMPTemp. season 0.0178452 0.0032867 5.430
## PERMTEMPTemp. contract 0.0089444 0.0021669 4.128
## PERMTEMPTemp. casual 0.0115115 0.0026312 4.375
## FTPTMAINPart-time 0.0186885 0.0021900 8.533
## UTOTHRS 0.0203361 0.0090995 2.235
## MARSTATCommon-law 0.0010521 0.0016940 0.621
## MARSTATWidowed 0.0092272 0.0041691 2.213
## MARSTATSeparated 0.0012285 0.0028667 0.429
## MARSTATDivorced 0.0002745 0.0022475 0.122
## MARSTATSingle, NM 0.0035107 0.0016846 2.084
## UNIONNot member but covered -0.0001607 0.0038790 -0.041
## UNIONNon-unionized 0.0207842 0.0014746 14.094
## LFSSTATEmployed, absent from work 0.0021582 0.0018875 1.143
## MJHMultiple jobholder 0.0002582 0.0024231 0.107
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health 1.07e-07 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities 2.70e-13 ***
## NAICS_18Construction 2.45e-11 ***
## NAICS_18Manufacturing durables 1.14e-14 ***
## NAICS_18Manufacturing non-durables 5.19e-06 ***
## NAICS_18Wholesale Trade 1.41e-09 ***
## NAICS_18Retail Trade 0.79342
## NAICS_18Transportation & Warehousing 2.68e-08 ***
## NAICS_18Finance, Insurance, Real Est. & Leas. 4.17e-12 ***
## NAICS_18Prof., Scientific & Technical Services 2.35e-13 ***
## NAICS_18Management, Admin. & Support 0.03463 *
## NAICS_18Educational Services 8.12e-12 ***
## NAICS_18Health Care & Social Assistance 1.23e-05 ***
## NAICS_18Information, Culture & Recreation 7.18e-06 ***
## NAICS_18Accommodation & Food Services 0.51345
## NAICS_18Other Services 3.88e-06 ***
## NAICS_18Public Administration < 2e-16 ***
## AGE_12.L 0.02175 *
## AGE_12.Q < 2e-16 ***
## AGE_12.C 0.01008 *
## AGE_12^4 0.65124
## AGE_12^5 0.05873 .
## AGE_12^6 0.01934 *
## AGE_12^7 0.01274 *
## AGE_12^8 0.10203
## AGE_12^9 0.73106
## AGE_12^10 0.65218
## AGE_12^11 0.24210
## PROVPEI 0.79482
## PROVNS 0.15620
## PROVNB 0.91714
## PROVQC 1.54e-11 ***
## PROVON < 2e-16 ***
## PROVMB 2.80e-08 ***
## PROVSK < 2e-16 ***
## PROVAB < 2e-16 ***
## PROVBC < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 1.37e-07 ***
## EDUC.C 0.00828 **
## EDUC^4 0.01147 *
## EDUC^5 9.26e-08 ***
## EDUC^6 0.05069 .
## TENURE < 2e-16 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.59692
## ESTSIZE.C 0.81171
## AGYOWNK.L 0.27855
## AGYOWNK.Q 0.13118
## AGYOWNK.C 0.89598
## AGYOWNK^4 0.08529 .
## FIRMSIZE.L 1.66e-15 ***
## FIRMSIZE.Q 0.51598
## FIRMSIZE.C 0.14102
## PERMTEMPTemp. season 5.72e-08 ***
## PERMTEMPTemp. contract 3.68e-05 ***
## PERMTEMPTemp. casual 1.22e-05 ***
## FTPTMAINPart-time < 2e-16 ***
## UTOTHRS 0.02544 *
## MARSTATCommon-law 0.53455
## MARSTATWidowed 0.02689 *
## MARSTATSeparated 0.66826
## MARSTATDivorced 0.90279
## MARSTATSingle, NM 0.03718 *
## UNIONNot member but covered 0.96695
## UNIONNon-unionized < 2e-16 ***
## LFSSTATEmployed, absent from work 0.25286
## MJHMultiple jobholder 0.91514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07137 on 19252 degrees of freedom
## Multiple R-squared: 0.5881, Adjusted R-squared: 0.5864
## F-statistic: 356.9 on 77 and 19252 DF, p-value: < 2.2e-16
# Diagnostic plots for the transformed model, using the margins
# c(bottom, left, top, right) = c(5.1, 4.1, 4.1, 2.1).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)
# Residual skewness of the two fits
# Before transformation:
skewness(model[["residuals"]])
## [1] 1.181981
# After transformation:
skewness(new.model[["residuals"]])
## [1] 0.5286001
# Test Set Correction!!!
# The 2009 female test set contains an observation whose HRLYEARN (hourly
# wages) is zero. It cannot be scored later on: the errors are computed on
# HRLYEARN^(-0.15), and 0^(-0.15) is Inf, which would make RMSE and MAE infinite.
# Drop zero-wage rows by value instead of by a hard-coded row position, so the
# step still removes the right observation if the train/test split changes and
# does not delete a valid row when it is run a second time.
# Rows with a missing HRLYEARN are deliberately left in place.
test.09fem <- test.09fem[is.na(test.09fem$HRLYEARN) | test.09fem$HRLYEARN != 0, ]
# Prediction
# Score the 2009 female test set with the transformed model; only the "fit"
# column of the returned matrix is used below.
prediction <- predict(
  new.model,
  interval = "prediction",
  newdata = test.09fem
)
# Errors
# Computed on the transformed scale: fitted value minus HRLYEARN^(-0.15).
errors <- prediction[, "fit"] - with(test.09fem, HRLYEARN^(-0.15))
hist(errors)
# Root mean squared error and mean absolute error over the test rows
rmse <- sqrt(sum(errors^2) / nrow(test.09fem))
mae <- sum(abs(errors)) * (1 / nrow(test.09fem))
# Absolute error as a percentage of the transformed actual value
diff.percent <- (abs(errors) / with(test.09fem, HRLYEARN^-0.15)) * 100
# Share of test rows whose percentage error is at most 25 (comparison is <=)
diff.25 <- length(diff.percent[diff.percent <= 25]) / nrow(test.09fem)
paste("RMSE:", rmse)
## [1] "RMSE: 0.0727054261959815"
paste("MAE:", mae)
## [1] "MAE: 0.0534999014295685"
paste("Percentage of cases with less than 25% error:", diff.25 * 100)
## [1] "Percentage of cases with less than 25% error: 99.8430709802028"
# Females 2019
# Model
# Upper scope of the search: HRLYEARN regressed on every candidate predictor.
full <- lm(
  HRLYEARN ~ LFSSTAT + PROV + CMA + AGE_12 + MARSTAT + EDUC + MJH + COWMAIN +
    NAICS_18 + NOC_10 + FTPTMAIN + UTOTHRS + TENURE + UNION + PERMTEMP +
    ESTSIZE + FIRMSIZE + SCHOOLN + EFAMTYPE + AGYOWNK,
  data = train.19fem
)
# Lower scope and starting point of the search: intercept-only model.
null <- lm(HRLYEARN ~ 1, data = train.19fem)
# Stepwise selection by AIC, adding and dropping terms between `null` and `full`.
# Set trace = TRUE to print each step of adding and subtracting variables.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
model <- stepAIC(
  null,
  scope = list(lower = null, upper = full),
  direction = "both",
  trace = FALSE
)
summary(model)
##
## Call:
## lm(formula = HRLYEARN ~ NOC_10 + EDUC + TENURE + NAICS_18 + PROV +
## ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
## UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT, data = train.19fem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35100 -0.04429 -0.00725 0.03222 0.70051
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 2.903e-01 1.195e-02 24.297
## NOC_10Business, finance & administration -1.021e-01 2.950e-03 -34.631
## NOC_10Natural & applied sciences -4.995e-02 4.288e-03 -11.651
## NOC_10Health -5.306e-02 3.412e-03 -15.548
## NOC_10Educ., law, community & gov. services -7.682e-02 3.199e-03 -24.010
## NOC_10Art, culture, recreation & sport -1.055e-01 5.175e-03 -20.385
## NOC_10Sales & service -1.275e-01 3.084e-03 -41.359
## NOC_10Trades, transport & equipm. operators -1.081e-01 5.059e-03 -21.361
## NOC_10Natural resources & agriculture -9.893e-02 9.121e-03 -10.847
## NOC_10Manufacturing & utilities -1.363e-01 5.493e-03 -24.807
## EDUC.L 8.306e-02 3.977e-03 20.886
## EDUC.Q 3.829e-02 3.681e-03 10.401
## EDUC.C 2.056e-02 2.912e-03 7.060
## EDUC^4 -6.162e-03 2.458e-03 -2.507
## EDUC^5 -5.867e-03 1.673e-03 -3.506
## EDUC^6 -3.233e-03 1.929e-03 -1.676
## TENURE 5.823e-02 2.113e-03 27.557
## NAICS_18Forestry, Fishing, Min., Oil & Gas 1.154e-01 1.045e-02 11.045
## NAICS_18Utilities 1.092e-01 1.323e-02 8.257
## NAICS_18Construction 5.067e-02 1.092e-02 4.639
## NAICS_18Manufacturing durables 4.871e-02 1.101e-02 4.425
## NAICS_18Manufacturing non-durables 3.603e-02 1.082e-02 3.330
## NAICS_18Wholesale Trade 5.322e-02 1.096e-02 4.854
## NAICS_18Retail Trade 1.372e-02 1.027e-02 1.336
## NAICS_18Transportation & Warehousing 3.342e-02 1.075e-02 3.109
## NAICS_18Finance, Insurance, Real Est. & Leas. 6.138e-02 1.037e-02 5.917
## NAICS_18Prof., Scientific & Technical Services 5.533e-02 1.046e-02 5.291
## NAICS_18Management, Admin. & Support 2.594e-02 1.051e-02 2.469
## NAICS_18Educational Services 3.497e-02 1.050e-02 3.331
## NAICS_18Health Care & Social Assistance 1.447e-02 1.028e-02 1.408
## NAICS_18Information, Culture & Recreation 2.704e-02 1.060e-02 2.551
## NAICS_18Accommodation & Food Services 2.081e-02 1.037e-02 2.007
## NAICS_18Other Services 3.217e-02 1.055e-02 3.049
## NAICS_18Public Administration 5.522e-02 1.047e-02 5.276
## PROVPEI -1.769e-02 4.654e-03 -3.802
## PROVNS -2.628e-02 4.078e-03 -6.444
## PROVNB -2.415e-02 4.151e-03 -5.817
## PROVQC -4.173e-03 3.547e-03 -1.177
## PROVON 8.836e-03 3.420e-03 2.584
## PROVMB -9.988e-03 3.816e-03 -2.617
## PROVSK 4.154e-03 3.878e-03 1.071
## PROVAB 2.437e-02 3.722e-03 6.547
## PROVBC 7.478e-03 3.630e-03 2.060
## ESTSIZE.L 1.873e-02 1.695e-03 11.049
## ESTSIZE.Q 3.767e-03 1.412e-03 2.667
## ESTSIZE.C -4.077e-04 1.312e-03 -0.311
## AGE_12.L -1.168e-02 4.818e-03 -2.424
## AGE_12.Q -2.766e-02 4.239e-03 -6.525
## AGE_12.C -2.093e-02 3.609e-03 -5.800
## AGE_12^4 7.594e-03 3.235e-03 2.348
## AGE_12^5 -7.689e-03 2.925e-03 -2.629
## AGE_12^6 -2.822e-03 2.602e-03 -1.085
## AGE_12^7 -2.552e-03 2.310e-03 -1.105
## AGE_12^8 -2.633e-03 2.062e-03 -1.277
## AGE_12^9 1.095e-03 1.873e-03 0.585
## AGE_12^10 2.825e-04 1.766e-03 0.160
## AGE_12^11 1.232e-03 1.744e-03 0.707
## COWMAINPrivate sector -2.868e-02 2.132e-03 -13.454
## PERMTEMPTemp. season -1.707e-02 3.909e-03 -4.366
## PERMTEMPTemp. contract -1.544e-02 2.416e-03 -6.391
## PERMTEMPTemp. casual -1.862e-02 2.898e-03 -6.425
## FIRMSIZE.L 9.263e-03 1.594e-03 5.812
## FIRMSIZE.Q 1.981e-03 1.510e-03 1.312
## FIRMSIZE.C -3.033e-04 1.596e-03 -0.190
## FTPTMAINPart-time -1.672e-02 2.232e-03 -7.491
## UTOTHRS -4.984e-02 8.860e-03 -5.625
## EFAMTYPEHWDENC -4.107e-03 2.996e-03 -1.371
## EFAMTYPEHWDE17 1.424e-04 2.964e-03 0.048
## EFAMTYPEHWDE24 -5.367e-03 3.232e-03 -1.661
## EFAMTYPEHWSHNC -4.386e-03 8.755e-03 -0.501
## EFAMTYPEHWSH17 -5.218e-03 7.655e-03 -0.682
## EFAMTYPEHWSH24 1.503e-02 1.028e-02 1.463
## EFAMTYPEHWSWNC -7.142e-03 3.678e-03 -1.942
## EFAMTYPEHWSW17 -8.043e-03 5.055e-03 -1.591
## EFAMTYPEHWSW24 -1.498e-02 6.834e-03 -2.192
## EFAMTYPEHWNENC -1.679e-02 7.322e-03 -2.293
## EFAMTYPEHWNE17 -1.304e-02 1.890e-02 -0.690
## EFAMTYPEHWNE24 -6.159e-03 1.700e-02 -0.362
## EFAMTYPESPE17 -3.546e-03 2.805e-03 -1.264
## EFAMTYPESPE24 -1.110e-02 4.316e-03 -2.571
## EFAMTYPESPN17 -3.399e-03 1.344e-02 -0.253
## EFAMTYPESPN24 -1.034e-02 1.361e-02 -0.760
## EFAMTYPEOther -9.108e-03 2.723e-03 -3.344
## LFSSTATEmployed, absent from work -5.284e-03 2.060e-03 -2.565
## SCHOOLNFull-time student -6.879e-03 3.138e-03 -2.192
## SCHOOLNPart-time student 1.235e-03 3.696e-03 0.334
## SCHOOLNUnknown NA NA NA
## MARSTATCommon-law 4.295e-05 1.800e-03 0.024
## MARSTATWidowed -7.476e-03 5.341e-03 -1.400
## MARSTATSeparated 6.652e-03 4.088e-03 1.627
## MARSTATDivorced -2.637e-03 3.547e-03 -0.744
## MARSTATSingle, NM -3.683e-03 2.711e-03 -1.359
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences < 2e-16 ***
## NOC_10Health < 2e-16 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q < 2e-16 ***
## EDUC.C 1.73e-12 ***
## EDUC^4 0.012183 *
## EDUC^5 0.000456 ***
## EDUC^6 0.093717 .
## TENURE < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction 3.53e-06 ***
## NAICS_18Manufacturing durables 9.69e-06 ***
## NAICS_18Manufacturing non-durables 0.000872 ***
## NAICS_18Wholesale Trade 1.22e-06 ***
## NAICS_18Retail Trade 0.181606
## NAICS_18Transportation & Warehousing 0.001883 **
## NAICS_18Finance, Insurance, Real Est. & Leas. 3.34e-09 ***
## NAICS_18Prof., Scientific & Technical Services 1.23e-07 ***
## NAICS_18Management, Admin. & Support 0.013565 *
## NAICS_18Educational Services 0.000866 ***
## NAICS_18Health Care & Social Assistance 0.159135
## NAICS_18Information, Culture & Recreation 0.010765 *
## NAICS_18Accommodation & Food Services 0.044802 *
## NAICS_18Other Services 0.002297 **
## NAICS_18Public Administration 1.33e-07 ***
## PROVPEI 0.000144 ***
## PROVNS 1.20e-10 ***
## PROVNB 6.10e-09 ***
## PROVQC 0.239312
## PROVON 0.009780 **
## PROVMB 0.008868 **
## PROVSK 0.284191
## PROVAB 6.03e-11 ***
## PROVBC 0.039414 *
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.007653 **
## ESTSIZE.C 0.755955
## AGE_12.L 0.015364 *
## AGE_12.Q 6.99e-11 ***
## AGE_12.C 6.74e-09 ***
## AGE_12^4 0.018903 *
## AGE_12^5 0.008584 **
## AGE_12^6 0.278109
## AGE_12^7 0.269275
## AGE_12^8 0.201584
## AGE_12^9 0.558889
## AGE_12^10 0.872913
## AGE_12^11 0.479791
## COWMAINPrivate sector < 2e-16 ***
## PERMTEMPTemp. season 1.27e-05 ***
## PERMTEMPTemp. contract 1.69e-10 ***
## PERMTEMPTemp. casual 1.35e-10 ***
## FIRMSIZE.L 6.30e-09 ***
## FIRMSIZE.Q 0.189563
## FIRMSIZE.C 0.849248
## FTPTMAINPart-time 7.15e-14 ***
## UTOTHRS 1.88e-08 ***
## EFAMTYPEHWDENC 0.170496
## EFAMTYPEHWDE17 0.961690
## EFAMTYPEHWDE24 0.096770 .
## EFAMTYPEHWSHNC 0.616447
## EFAMTYPEHWSH17 0.495480
## EFAMTYPEHWSH24 0.143593
## EFAMTYPEHWSWNC 0.052155 .
## EFAMTYPEHWSW17 0.111591
## EFAMTYPEHWSW24 0.028405 *
## EFAMTYPEHWNENC 0.021864 *
## EFAMTYPEHWNE17 0.490450
## EFAMTYPEHWNE24 0.717066
## EFAMTYPESPE17 0.206294
## EFAMTYPESPE24 0.010151 *
## EFAMTYPESPN17 0.800343
## EFAMTYPESPN24 0.447166
## EFAMTYPEOther 0.000826 ***
## LFSSTATEmployed, absent from work 0.010321 *
## SCHOOLNFull-time student 0.028390 *
## SCHOOLNPart-time student 0.738281
## SCHOOLNUnknown NA
## MARSTATCommon-law 0.980963
## MARSTATWidowed 0.161598
## MARSTATSeparated 0.103728
## MARSTATDivorced 0.457165
## MARSTATSingle, NM 0.174288
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.077 on 17711 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.5135
## F-statistic: 209.8 on 90 and 17711 DF, p-value: < 2.2e-16
# Diagnostic plots for the stepwise-selected model
# (uncomment the next line to arrange the plots in a 2 x 2 grid)
# par(mfrow = c(2, 2))
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = model, lwd = 6)
# Box Cox Transformation (Selected attributes from Stepwise Regression)
# Refit the formula reported by summary(model) so boxcox() can evaluate it.
bc.model <- lm(
  formula = HRLYEARN ~ NOC_10 + EDUC + TENURE + NAICS_18 + PROV +
    ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT,
  data = train.19fem
)
# Larger margins for the Box-Cox plot
par(mar = c(15, 5, 2, 1))
bc <- boxcox(bc.model, lambda = seq(-3, 3))
# Lambda (bc$x) at which bc$y is largest. which.max() replaces
# which(bc$y == max(bc$y)): it avoids a floating-point equality test and
# always yields a single index.
bc$x[which.max(bc$y)]
## [1] -0.1515152
# Refit the selected formula with the response raised to the power -0.15
# (the Box-Cox lambda of about -0.1515, rounded).
# Because the exponent is negative, larger HRLYEARN values give smaller
# response values, so coefficient signs are reversed relative to the
# untransformed fit.
new.model <- lm(
  formula = (HRLYEARN)^-0.15 ~ NOC_10 + EDUC + TENURE + NAICS_18 + PROV +
    ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE + FTPTMAIN +
    UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT,
  data = train.19fem
)
summary(new.model)
##
## Call:
## lm(formula = (HRLYEARN)^-0.15 ~ NOC_10 + EDUC + TENURE + NAICS_18 +
## PROV + ESTSIZE + AGE_12 + COWMAIN + PERMTEMP + FIRMSIZE +
## FTPTMAIN + UTOTHRS + EFAMTYPE + LFSSTAT + SCHOOLN + MARSTAT,
## data = train.19fem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34768 -0.03654 0.00094 0.03679 0.78197
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) 1.2645149 0.0096986 130.382
## NOC_10Business, finance & administration 0.0632019 0.0023946 26.393
## NOC_10Natural & applied sciences 0.0277562 0.0034809 7.974
## NOC_10Health 0.0266160 0.0027704 9.607
## NOC_10Educ., law, community & gov. services 0.0488443 0.0025974 18.805
## NOC_10Art, culture, recreation & sport 0.0663367 0.0042015 15.789
## NOC_10Sales & service 0.0943841 0.0025035 37.701
## NOC_10Trades, transport & equipm. operators 0.0731511 0.0041075 17.809
## NOC_10Natural resources & agriculture 0.0629183 0.0074044 8.497
## NOC_10Manufacturing & utilities 0.1068319 0.0044597 23.955
## EDUC.L -0.0657568 0.0032288 -20.366
## EDUC.Q -0.0156022 0.0029886 -5.221
## EDUC.C -0.0143191 0.0023639 -6.057
## EDUC^4 0.0081766 0.0019953 4.098
## EDUC^5 0.0049324 0.0013584 3.631
## EDUC^6 0.0004752 0.0015660 0.303
## TENURE -0.0512488 0.0017153 -29.877
## NAICS_18Forestry, Fishing, Min., Oil & Gas -0.0989629 0.0084834 -11.665
## NAICS_18Utilities -0.0948650 0.0107376 -8.835
## NAICS_18Construction -0.0598373 0.0088688 -6.747
## NAICS_18Manufacturing durables -0.0649400 0.0089370 -7.266
## NAICS_18Manufacturing non-durables -0.0492158 0.0087846 -5.603
## NAICS_18Wholesale Trade -0.0644516 0.0089000 -7.242
## NAICS_18Retail Trade -0.0186889 0.0083402 -2.241
## NAICS_18Transportation & Warehousing -0.0441720 0.0087272 -5.061
## NAICS_18Finance, Insurance, Real Est. & Leas. -0.0696109 0.0084226 -8.265
## NAICS_18Prof., Scientific & Technical Services -0.0635029 0.0084891 -7.480
## NAICS_18Management, Admin. & Support -0.0341059 0.0085290 -3.999
## NAICS_18Educational Services -0.0446031 0.0085222 -5.234
## NAICS_18Health Care & Social Assistance -0.0312260 0.0083429 -3.743
## NAICS_18Information, Culture & Recreation -0.0364530 0.0086060 -4.236
## NAICS_18Accommodation & Food Services -0.0228528 0.0084177 -2.715
## NAICS_18Other Services -0.0395185 0.0085653 -4.614
## NAICS_18Public Administration -0.0600222 0.0084973 -7.064
## PROVPEI 0.0082055 0.0037783 2.172
## PROVNS 0.0202298 0.0033103 6.111
## PROVNB 0.0171803 0.0033700 5.098
## PROVQC -0.0040035 0.0028793 -1.390
## PROVON -0.0146772 0.0027765 -5.286
## PROVMB 0.0067135 0.0030981 2.167
## PROVSK -0.0067018 0.0031486 -2.129
## PROVAB -0.0298605 0.0030216 -9.882
## PROVBC -0.0184483 0.0029473 -6.259
## ESTSIZE.L -0.0142272 0.0013763 -10.337
## ESTSIZE.Q -0.0018528 0.0011465 -1.616
## ESTSIZE.C 0.0014875 0.0010649 1.397
## AGE_12.L 0.0112419 0.0039114 2.874
## AGE_12.Q 0.0348341 0.0034411 10.123
## AGE_12.C 0.0089578 0.0029301 3.057
## AGE_12^4 -0.0002968 0.0026260 -0.113
## AGE_12^5 0.0083008 0.0023750 3.495
## AGE_12^6 0.0002822 0.0021121 0.134
## AGE_12^7 0.0052047 0.0018755 2.775
## AGE_12^8 0.0021152 0.0016741 1.264
## AGE_12^9 -0.0009318 0.0015209 -0.613
## AGE_12^10 -0.0002565 0.0014338 -0.179
## AGE_12^11 -0.0010725 0.0014157 -0.758
## COWMAINPrivate sector 0.0275927 0.0017307 15.943
## PERMTEMPTemp. season 0.0184655 0.0031736 5.819
## PERMTEMPTemp. contract 0.0085185 0.0019613 4.343
## PERMTEMPTemp. casual 0.0160010 0.0023527 6.801
## FIRMSIZE.L -0.0099365 0.0012940 -7.679
## FIRMSIZE.Q 0.0006736 0.0012258 0.550
## FIRMSIZE.C -0.0005100 0.0012953 -0.394
## FTPTMAINPart-time 0.0194216 0.0018118 10.719
## UTOTHRS 0.0367328 0.0071932 5.107
## EFAMTYPEHWDENC 0.0008833 0.0024325 0.363
## EFAMTYPEHWDE17 0.0005895 0.0024060 0.245
## EFAMTYPEHWDE24 0.0041420 0.0026235 1.579
## EFAMTYPEHWSHNC 0.0085380 0.0071081 1.201
## EFAMTYPEHWSH17 0.0102411 0.0062146 1.648
## EFAMTYPEHWSH24 -0.0131896 0.0083437 -1.581
## EFAMTYPEHWSWNC 0.0048180 0.0029858 1.614
## EFAMTYPEHWSW17 0.0106901 0.0041038 2.605
## EFAMTYPEHWSW24 0.0143981 0.0055481 2.595
## EFAMTYPEHWNENC 0.0131091 0.0059444 2.205
## EFAMTYPEHWNE17 0.0185854 0.0153457 1.211
## EFAMTYPEHWNE24 0.0038376 0.0137987 0.278
## EFAMTYPESPE17 0.0058985 0.0022775 2.590
## EFAMTYPESPE24 0.0086710 0.0035037 2.475
## EFAMTYPESPN17 0.0055439 0.0109115 0.508
## EFAMTYPESPN24 0.0133693 0.0110482 1.210
## EFAMTYPEOther 0.0082649 0.0022109 3.738
## LFSSTATEmployed, absent from work 0.0028700 0.0016724 1.716
## SCHOOLNFull-time student 0.0086524 0.0025476 3.396
## SCHOOLNPart-time student 0.0011124 0.0030006 0.371
## SCHOOLNUnknown NA NA NA
## MARSTATCommon-law -0.0008973 0.0014612 -0.614
## MARSTATWidowed 0.0029769 0.0043359 0.687
## MARSTATSeparated -0.0092692 0.0033188 -2.793
## MARSTATDivorced 0.0003429 0.0028796 0.119
## MARSTATSingle, NM 0.0027031 0.0022009 1.228
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## NOC_10Business, finance & administration < 2e-16 ***
## NOC_10Natural & applied sciences 1.63e-15 ***
## NOC_10Health < 2e-16 ***
## NOC_10Educ., law, community & gov. services < 2e-16 ***
## NOC_10Art, culture, recreation & sport < 2e-16 ***
## NOC_10Sales & service < 2e-16 ***
## NOC_10Trades, transport & equipm. operators < 2e-16 ***
## NOC_10Natural resources & agriculture < 2e-16 ***
## NOC_10Manufacturing & utilities < 2e-16 ***
## EDUC.L < 2e-16 ***
## EDUC.Q 1.80e-07 ***
## EDUC.C 1.41e-09 ***
## EDUC^4 4.19e-05 ***
## EDUC^5 0.000283 ***
## EDUC^6 0.761548
## TENURE < 2e-16 ***
## NAICS_18Forestry, Fishing, Min., Oil & Gas < 2e-16 ***
## NAICS_18Utilities < 2e-16 ***
## NAICS_18Construction 1.56e-11 ***
## NAICS_18Manufacturing durables 3.84e-13 ***
## NAICS_18Manufacturing non-durables 2.14e-08 ***
## NAICS_18Wholesale Trade 4.61e-13 ***
## NAICS_18Retail Trade 0.025049 *
## NAICS_18Transportation & Warehousing 4.20e-07 ***
## NAICS_18Finance, Insurance, Real Est. & Leas. < 2e-16 ***
## NAICS_18Prof., Scientific & Technical Services 7.75e-14 ***
## NAICS_18Management, Admin. & Support 6.39e-05 ***
## NAICS_18Educational Services 1.68e-07 ***
## NAICS_18Health Care & Social Assistance 0.000183 ***
## NAICS_18Information, Culture & Recreation 2.29e-05 ***
## NAICS_18Accommodation & Food Services 0.006637 **
## NAICS_18Other Services 3.98e-06 ***
## NAICS_18Public Administration 1.68e-12 ***
## PROVPEI 0.029890 *
## PROVNS 1.01e-09 ***
## PROVNB 3.47e-07 ***
## PROVQC 0.164415
## PROVON 1.26e-07 ***
## PROVMB 0.030250 *
## PROVSK 0.033308 *
## PROVAB < 2e-16 ***
## PROVBC 3.95e-10 ***
## ESTSIZE.L < 2e-16 ***
## ESTSIZE.Q 0.106111
## ESTSIZE.C 0.162469
## AGE_12.L 0.004056 **
## AGE_12.Q < 2e-16 ***
## AGE_12.C 0.002238 **
## AGE_12^4 0.910013
## AGE_12^5 0.000475 ***
## AGE_12^6 0.893695
## AGE_12^7 0.005525 **
## AGE_12^8 0.206423
## AGE_12^9 0.540125
## AGE_12^10 0.858006
## AGE_12^11 0.448722
## COWMAINPrivate sector < 2e-16 ***
## PERMTEMPTemp. season 6.04e-09 ***
## PERMTEMPTemp. contract 1.41e-05 ***
## PERMTEMPTemp. casual 1.07e-11 ***
## FIRMSIZE.L 1.69e-14 ***
## FIRMSIZE.Q 0.582668
## FIRMSIZE.C 0.693809
## FTPTMAINPart-time < 2e-16 ***
## UTOTHRS 3.31e-07 ***
## EFAMTYPEHWDENC 0.716525
## EFAMTYPEHWDE17 0.806441
## EFAMTYPEHWDE24 0.114401
## EFAMTYPEHWSHNC 0.229702
## EFAMTYPEHWSH17 0.099388 .
## EFAMTYPEHWSH24 0.113947
## EFAMTYPEHWSWNC 0.106624
## EFAMTYPEHWSW17 0.009197 **
## EFAMTYPEHWSW24 0.009464 **
## EFAMTYPEHWNENC 0.027448 *
## EFAMTYPEHWNE17 0.225868
## EFAMTYPEHWNE24 0.780926
## EFAMTYPESPE17 0.009608 **
## EFAMTYPESPE24 0.013340 *
## EFAMTYPESPN17 0.611402
## EFAMTYPESPN24 0.226260
## EFAMTYPEOther 0.000186 ***
## LFSSTATEmployed, absent from work 0.086171 .
## SCHOOLNFull-time student 0.000685 ***
## SCHOOLNPart-time student 0.710844
## SCHOOLNUnknown NA
## MARSTATCommon-law 0.539187
## MARSTATWidowed 0.492362
## MARSTATSeparated 0.005228 **
## MARSTATDivorced 0.905203
## MARSTATSingle, NM 0.219413
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06251 on 17711 degrees of freedom
## Multiple R-squared: 0.5654, Adjusted R-squared: 0.5632
## F-statistic: 256.1 on 90 and 17711 DF, p-value: < 2.2e-16
# Diagnostic plots for the transformed model, using the margins
# c(bottom, left, top, right) = c(5.1, 4.1, 4.1, 2.1).
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(x = new.model, lwd = 6)
# Residual skewness of the two fits
# Before transformation:
skewness(model[["residuals"]])
## [1] 1.290823
# After transformation:
skewness(new.model[["residuals"]])
## [1] 0.7025932
# Prediction
# Score the 2019 female test set with the transformed model; only the "fit"
# column of the returned matrix is used below.
prediction <- predict(
  new.model,
  interval = "prediction",
  newdata = test.19fem
)
## Warning in predict.lm(new.model, interval = "prediction", newdata = test.19fem):
## prediction from a rank-deficient fit may be misleading
# Errors
# Computed on the transformed scale: fitted value minus HRLYEARN^(-0.15).
errors <- prediction[, "fit"] - with(test.19fem, HRLYEARN^(-0.15))
hist(errors)
# Root mean squared error and mean absolute error over the test rows
rmse <- sqrt(sum(errors^2) / nrow(test.19fem))
mae <- sum(abs(errors)) * (1 / nrow(test.19fem))
# Absolute error as a percentage of the transformed actual value
diff.percent <- (abs(errors) / with(test.19fem, HRLYEARN^-0.15)) * 100
# Share of test rows whose percentage error is at most 25 (comparison is <=)
diff.25 <- length(diff.percent[diff.percent <= 25]) / nrow(test.19fem)
paste("RMSE:", rmse)
## [1] "RMSE: 0.0642458061529313"
paste("MAE:", mae)
## [1] "MAE: 0.0470490092790988"
paste("Percentage of cases with less than 25% error:", diff.25 * 100)
## [1] "Percentage of cases with less than 25% error: 99.8951507208388"
# write.csv(data.all, file = "Data_All.csv", row.names=FALSE)
# SECTOR #######################################################################
# Private
# Two-sample proportion test, 2009 vs. 2019, for COWMAIN == "Private sector":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "greater": H1 is that the 2009 proportion exceeds the 2019 one.
# NOTE(review): nrow(df[cond, ]) also counts rows where cond is NA; this
# assumes COWMAIN and SURVYEAR have no missing values here - confirm.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$COWMAIN == "Private sector", ]),
    nrow(data.all.19fem[data.all.19fem$COWMAIN == "Private sector", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Private sector", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Private sector", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$COWMAIN == "Private sector", ]), nrow(data.all.19fem[data.all.19fem$COWMAIN == "Private sector", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Private sector", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Private sector", ]))
## X-squared = 15.371, df = 1, p-value = 4.417e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.008155203 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.4612816 0.4472080
# Public
# Two-sample proportion test, 2009 vs. 2019, for COWMAIN == "Public sector":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$COWMAIN == "Public sector", ]),
    nrow(data.all.19fem[data.all.19fem$COWMAIN == "Public sector", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Public sector", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Public sector", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$COWMAIN == "Public sector", ]), nrow(data.all.19fem[data.all.19fem$COWMAIN == "Public sector", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$COWMAIN == "Public sector", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$COWMAIN == "Public sector", ]))
## X-squared = 3.0416, df = 1, p-value = 0.04058
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.0000000000 -0.0005711443
## sample estimates:
## prop 1 prop 2
## 0.6309408 0.6410219
# INDUSTRY ####################################################################
# Construction
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "Const":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
# NOTE(review): nrow(df[cond, ]) also counts rows where cond is NA; this
# assumes NAICS_18short and SURVYEAR have no missing values here - confirm.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Const", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Const", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Const", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Const", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Const", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Const", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Const", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Const", ]))
## X-squared = 8.7586, df = 1, p-value = 0.001541
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.000000000 -0.009921907
## sample estimates:
## prop 1 prop 2
## 0.1061281 0.1286706
# Forestry, Fishing, Min., Oil & Gas
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "Fores":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Fores", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Fores", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Fores", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Fores", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Fores", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Fores", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Fores", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Fores", ]))
## X-squared = 0.58053, df = 1, p-value = 0.2231
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.00000000 0.01162354
## sample estimates:
## prop 1 prop 2
## 0.1507024 0.1613774
# Manufacturing durables
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "ManuD":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "greater": H1 is that the 2009 proportion exceeds the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuD", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuD", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuD", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuD", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuD", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuD", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuD", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuD", ]))
## X-squared = 0.11841, df = 1, p-value = 0.3654
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.01337678 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.1972265 0.1933480
# Utilities
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "Utils":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Utils", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Utils", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Utils", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Utils", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Utils", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Utils", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Utils", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Utils", ]))
## X-squared = 0.13892, df = 1, p-value = 0.3547
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.00000000 0.03201062
## sample estimates:
## prop 1 prop 2
## 0.2211690 0.2323232
# Transportation & Warehousing
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "Trans":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Trans", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Trans", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Trans", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Trans", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Trans", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Trans", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Trans", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Trans", ]))
## X-squared = 0.0096919, df = 1, p-value = 0.4608
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.00000000 0.01918169
## sample estimates:
## prop 1 prop 2
## 0.2668506 0.2684642
# Wholesale Trade
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "Whole":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "less": H1 is that the 2009 proportion is below the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Whole", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Whole", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Whole", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Whole", ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Whole", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Whole", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Whole", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Whole", ]))
## X-squared = 0.75298, df = 1, p-value = 0.1928
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.0000000 0.0122707
## sample estimates:
## prop 1 prop 2
## 0.2888638 0.3031447
# Manufacturing non-durables
# Two-sample proportion test, 2009 vs. 2019, for NAICS_18short == "ManuN":
#   x = matching rows in data.all.09fem / data.all.19fem,
#   n = matching rows in data.all for SURVYEAR 2009 / 2019.
# alternative = "greater": H1 is that the 2009 proportion exceeds the 2019 one.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuN", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuN", ])
  ),
  n = c(
    nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuN", ]),
    nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuN", ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ManuN", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ManuN", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ManuN", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ManuN", ]))
## X-squared = 0.23825, df = 1, p-value = 0.3127
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.01560947 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.3618397 0.3548527
# Agriculture (NAICS_18short == "Agri")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Agri", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Agri", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Agri",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Agri",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Agri", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Agri", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Agri", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Agri", ]))
## X-squared = 3.4405, df = 1, p-value = 0.03181
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.000000000 -0.005276911
## sample estimates:
## prop 1 prop 2
## 0.3077994 0.3554302
# Management, Admin. & Support (NAICS_18short == "Mngt")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Mngt", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Mngt", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Mngt",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Mngt",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Mngt", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Mngt", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Mngt", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Mngt", ]))
## X-squared = 5.9742, df = 1, p-value = 0.007258
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.01358235 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.4528406 0.4108575
# Prof., Scientific & Technical Services (NAICS_18short == "ProSc")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ProSc", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ProSc", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ProSc",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ProSc",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "ProSc", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "ProSc", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "ProSc", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "ProSc", ]))
## X-squared = 3.7569, df = 1, p-value = 0.0263
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.00431574 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.4972222 0.4683240
# Information, Culture & Recreation (NAICS_18short == "Info")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Info", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Info", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Info",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Info",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Info", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Info", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Info", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Info", ]))
## X-squared = 1.826, df = 1, p-value = 0.0883
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.004620229 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.4938215 0.4720129
# Public Administration (NAICS_18short == "PubAd")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "PubAd", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "PubAd", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "PubAd",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "PubAd", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "PubAd", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "PubAd", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "PubAd", ]))
## X-squared = 0.48639, df = 1, p-value = 0.2428
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.00000000 0.01108627
## sample estimates:
## prop 1 prop 2
## 0.4997339 0.5081690
# Other Services (NAICS_18short == "Other")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Other", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Other", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Other",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Other", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Other", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Other", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Other", ]))
## X-squared = 2.3303, df = 1, p-value = 0.06344
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.001929488 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.5481518 0.5227273
# Retail Trade (NAICS_18short == "Rtail")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Rtail", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Rtail", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Rtail",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Rtail", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Rtail", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Rtail", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Rtail", ]))
## X-squared = 18.252, df = 1, p-value = 9.675e-06
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.0226837 1.0000000
## sample estimates:
## prop 1 prop 2
## 0.5812002 0.5441633
# Finance, Insurance, Real Est. & Leas. (NAICS_18short == "Finan")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Finan", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Finan", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Finan",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Finan", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Finan", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Finan", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Finan", ]))
## X-squared = 18.143, df = 1, p-value = 1.025e-05
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.03481883 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.6524696 0.5953711
# Accommodation & Food Services (NAICS_18short == "AcFood")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "AcFood", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "AcFood", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "AcFood",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "AcFood", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "AcFood", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "AcFood", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "AcFood", ]))
## X-squared = 1.8235, df = 1, p-value = 0.08845
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.003296079 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.6379529 0.6225989
# Educational Services (NAICS_18short == "Educa")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Educa", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Educa", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Educa",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Educa", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Educa", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Educa", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Educa", ]))
## X-squared = 7.4584, df = 1, p-value = 0.003157
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.0000000 -0.0105416
## sample estimates:
## prop 1 prop 2
## 0.6808045 0.7075028
# Health Care & Social Assistance (NAICS_18short == "Health")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this category; trials: all 2009 and 2019 rows of data.all
# in the same category.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Health", ]),
    nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Health", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Health",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NAICS_18short == "Health", ]), nrow(data.all.19fem[data.all.19fem$NAICS_18short == "Health", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NAICS_18short == "Health", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NAICS_18short == "Health", ]))
## X-squared = 6.7094, df = 1, p-value = 0.004795
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.005639885 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.8534341 0.8378633
# OCCUPATION ###################################################################
# Trades, transport & equipm. operators (NOC_10short == "Trades")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Trades", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Trades", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Trades",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Trades", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Trades", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Trades", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Trades", ]))
## X-squared = 5.403, df = 1, p-value = 0.01005
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.000000000 -0.002573939
## sample estimates:
## prop 1 prop 2
## 0.06091718 0.06989448
# Natural resources & agriculture (NOC_10short == "NatAgri")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatAgri", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatAgri", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatAgri",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatAgri", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatAgri", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatAgri", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatAgri", ]))
## X-squared = 8.4056, df = 1, p-value = 0.00187
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.0000000 -0.0175078
## sample estimates:
## prop 1 prop 2
## 0.1682479 0.2092875
# Natural & applied sciences (NOC_10short == "NatASc")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatASc", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatASc", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatASc",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "NatASc", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "NatASc", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "NatASc", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "NatASc", ]))
## X-squared = 1.1179, df = 1, p-value = 0.1452
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.000000000 0.005891185
## sample estimates:
## prop 1 prop 2
## 0.2213018 0.2322076
# Manufacturing & utilities (NOC_10short == "ManUtil")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "ManUtil", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "ManUtil", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ManUtil",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "ManUtil", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "ManUtil", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ManUtil", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ManUtil", ]))
## X-squared = 7.1194, df = 1, p-value = 0.003813
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.01237363 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.2831858 0.2506759
# Management (NOC_10short == "Mngt")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Mngt", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Mngt", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Mngt",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Mngt", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Mngt", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Mngt", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Mngt", ]))
## X-squared = 0.68827, df = 1, p-value = 0.2034
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.01000759 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.4310008 0.4205007
# Art, culture, recreation & sport (NOC_10short == "ArtCul")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "ArtCul", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "ArtCul", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ArtCul",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "ArtCul", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "ArtCul", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "ArtCul", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "ArtCul", ]))
## X-squared = 1.8359, df = 1, p-value = 0.08771
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.006708564 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.605042 0.572590
# Sales & service (NOC_10short == "Sales")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Sales", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Sales", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Sales",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Sales", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Sales", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Sales", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Sales", ]))
## X-squared = 13.65, df = 1, p-value = 0.0001101
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.01223069 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.5964467 0.5743233
# Educ., law, community & gov. services (NOC_10short == "EduLaw")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "less": H1 is that the 2009 proportion is below 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "EduLaw", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "EduLaw", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "EduLaw",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw",
    ])
  ),
  alternative = "less",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "EduLaw", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "EduLaw", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "EduLaw", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "EduLaw", ]))
## X-squared = 0.28984, df = 1, p-value = 0.2952
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.000000000 0.009369943
## sample estimates:
## prop 1 prop 2
## 0.7124247 0.7171559
# Business, finance & administration (NOC_10short == "BusFin")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "BusFin", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "BusFin", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "BusFin",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "BusFin", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "BusFin", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "BusFin", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "BusFin", ]))
## X-squared = 5.3661, df = 1, p-value = 0.01027
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.004375397 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.7521890 0.7369202
# Health (NOC_10short == "Health")
# Two-sample test of equal proportions. Successes: rows of data.all.09fem and
# data.all.19fem in this occupation; trials: all 2009 and 2019 rows of
# data.all in the same occupation.
# alternative = "greater": H1 is that the 2009 proportion exceeds 2019's.
prop.test(
  x = c(
    nrow(data.all.09fem[data.all.09fem$NOC_10short == "Health", ]),
    nrow(data.all.19fem[data.all.19fem$NOC_10short == "Health", ])
  ),
  n = c(
    nrow(data.all[
      data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Health",
    ]),
    nrow(data.all[
      data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health",
    ])
  ),
  alternative = "greater",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(nrow(data.all.09fem[data.all.09fem$NOC_10short == "Health", ]), nrow(data.all.19fem[data.all.19fem$NOC_10short == "Health", ])) out of c(nrow(data.all[data.all$SURVYEAR == 2009 & data.all$NOC_10short == "Health", ]), nrow(data.all[data.all$SURVYEAR == 2019 & data.all$NOC_10short == "Health", ]))
## X-squared = 5.5633, df = 1, p-value = 0.00917
## alternative hypothesis: greater
## 95 percent confidence interval:
## 0.005612905 1.000000000
## sample estimates:
## prop 1 prop 2
## 0.8661738 0.8474368